In [None]:
# =============================================================================
# ASU LEI Team - Baseline Question Generation
# Research Assistant: Shubham
# Task: Generate baseline MCQs from source texts using GPT-5
# =============================================================================

import pandas as pd
import numpy as np
import os
import json
import asyncio
import nest_asyncio
from pathlib import Path
from datetime import datetime
from openai import AsyncOpenAI
import re
from typing import List, Dict, Tuple, Optional

# Enable nested async for Jupyter
nest_asyncio.apply()

print("ASU LEI Team - Baseline Question Generation")
print("=" * 60)
print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# =============================================================================
# CONFIGURATION
# =============================================================================

# Set up paths
# If the working-directory path contains "notebook", the project root is taken
# to be its parent; otherwise the working directory itself is the root.
project_root = Path.cwd().parent if 'notebook' in str(Path.cwd()) else Path.cwd()
database_dir = project_root / "database"
source_texts_dir = database_dir  # Adjust if your source texts are in a different location

# OpenAI Configuration
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Imported here because it is only needed for the interactive fallback.
    from getpass import getpass

    print("WARNING: OPENAI_API_KEY not found in environment variables")
    print("Please set your API key: export OPENAI_API_KEY='your-key'")
    # getpass() does not echo the typed secret, so the key does not end up in
    # the terminal scrollback or in the notebook's saved cell output the way
    # it would with input().
    OPENAI_API_KEY = getpass("Enter your OpenAI API key: ").strip()

# Shared async client used by the question-generation step.
client = AsyncOpenAI(api_key=OPENAI_API_KEY)



In [None]:
# =============================================================================
# STEP 1: IMPORT ALL TEXTS INTO CSV WITH UNIQUE IDs
# =============================================================================

def _natural_stem_key(stem: str) -> List[Tuple[int, int, str]]:
    """Build a sort key that orders dotted numeric stems numerically ("1.2" before "1.10")."""
    return [
        (0, int(part), "") if part.isdecimal() else (1, 0, part)
        for part in stem.split(".")
    ]


def _split_chapter_section(stem: str) -> Tuple[str, str]:
    """Split a filename stem such as "11.2" into (chapter, section); section defaults to "1"."""
    if '.' in stem:
        chapter, section = stem.split('.', 1)
        return chapter, section
    return stem, "1"


def create_source_texts_csv(
    source_dir: Optional[Path] = None,
    output_dir: Optional[Path] = None,
    subjects: Optional[List[str]] = None,
) -> Optional[pd.DataFrame]:
    """
    Import all text files from subject folders and create source_texts.csv
    with unique IDs in format: subject_chapter_section

    Files inside each subject folder are processed in numeric chapter/section
    order so that the resulting CSV is identical from run to run, regardless
    of the order in which the filesystem lists them.

    Args:
        source_dir: Folder containing one sub-folder per subject. Defaults to
            the module-level ``source_texts_dir``.
        output_dir: Folder that receives ``source_texts.csv``. Defaults to the
            module-level ``database_dir``.
        subjects: Subject folder names to import. Defaults to the four
            subjects used by this project.

    Returns:
        A DataFrame with ``textID`` and ``text`` columns, or ``None`` when no
        non-empty text file was found (in which case no CSV is written).
    """
    print("\nSTEP 1: Creating source_texts.csv from downloaded files...")

    # Fall back to the notebook-level configuration when no override is given.
    source_dir = Path(source_texts_dir if source_dir is None else source_dir)
    output_dir = Path(database_dir if output_dir is None else output_dir)
    if subjects is None:
        subjects = ["Anthropology", "History", "Lifespan Development", "Sociology"]

    source_texts_data = []

    for subject in subjects:
        subject_folder = source_dir / subject

        if not subject_folder.exists():
            print(f"Warning: Folder '{subject}' not found in {source_dir}")
            continue

        print(f"Processing {subject} folder...")
        subject_clean = subject.lower().replace(" ", "_")

        # glob() yields files in arbitrary order; sort them naturally and use
        # the file name as a tie-breaker (e.g. "01.txt" vs "1.txt").
        txt_files = sorted(
            subject_folder.glob("*.txt"),
            key=lambda p: (_natural_stem_key(p.stem), p.name),
        )

        for txt_file in txt_files:
            # e.g. "1.2.txt" -> chapter "1", section "2"
            chapter, section = _split_chapter_section(txt_file.stem)
            text_id = f"{subject_clean}_{chapter}_{section}"

            # Read as UTF-8 to preserve diacritical marks; a file that cannot
            # be read is reported and skipped rather than aborting the import.
            try:
                text_content = txt_file.read_text(encoding='utf-8').strip()
            except UnicodeDecodeError:
                print(f"Error reading {txt_file}: Encoding issue")
                continue
            except OSError as e:
                print(f"Error reading {txt_file}: {e}")
                continue

            if not text_content:  # Only add non-empty files
                print(f" Skipped empty file: {txt_file}")
                continue

            source_texts_data.append({
                'textID': text_id,
                'text': text_content,
            })
            print(f" Added: {text_id} ({len(text_content)} characters)")

    if not source_texts_data:
        print("No text files found! Please check your folder structure.")
        return None

    # Create DataFrame and save to CSV
    df_source_texts = pd.DataFrame(source_texts_data)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "source_texts.csv"
    df_source_texts.to_csv(output_file, index=False, encoding='utf-8')

    print(f"\nCreated source_texts.csv with {len(source_texts_data)} texts")
    print(f"Saved to: {output_file}")

    # Display sample
    print(f"\nSample entries:")
    for _, row in df_source_texts.head(3).iterrows():
        print(f"  {row['textID']}: {row['text'][:100]}...")

    return df_source_texts

# Execute Step 1
# Runs when the cell is executed and binds the result (a DataFrame, or None
# when no text files were found) to the module-level name that the later
# steps read.
df_source_texts = create_source_texts_csv()



In [None]:
# =============================================================================
# STEP 2: QUESTION GENERATION WORKFLOW USING GPT-5
# =============================================================================

# The exact prompt from the task specification.
# This is a str.format() template: {source_text} is the only placeholder, and
# the doubled braces ({{ and }}) are escapes that render as literal JSON braces.
# Each "\\n" reaches the model as a backslash followed by "n", i.e. the JSON
# escape for a newline inside the example strings.
QUESTION_GENERATION_PROMPT = """<source>
{source_text}
</source>

[Your Task]:

Generate **five multiple-choice questions** based on the source text above.

- Include **two factual questions**, **two inferential questions**, and **one main idea question**.

### Question Type Definitions:

- **Factual Question**: Requires recall of specific facts or details explicitly stated in the source text.

- **Inferential Question**: Requires the student to go beyond what is directly stated in the text and draw a logical conclusion using textual evidence, reasoning, and sometimes prior knowledge. The answer is *implied*, not explicitly stated.

- **Main Idea Question**: Assesses the ability to identify the central point, primary purpose, or overall message of the text. It should *not* focus on minor details, examples, or secondary arguments.

### Output Format:

Return the response in the following JSON structure:

```json
{{
"question1": {{
"type": "factual",
"question": "<QUESTION>Your first factual question stem with four options:\\nA) ...\\nB) ...\\nC) ...\\nD) ...</QUESTION>",
"answer": "<ANSWER>C) Correct answer text</ANSWER>"
}},
"question2": {{
"type": "factual",
"question": "<QUESTION>Your second factual question stem with four options:\\nA) ...\\nB) ...\\nC) ...\\nD) ...</QUESTION>",
"answer": "<ANSWER>B) Correct answer text</ANSWER>"
}},
"question3": {{
"type": "inferential",
"question": "<QUESTION>Your first inferential question stem with four options:\\nA) ...\\nB) ...\\nC) ...\\nD) ...</QUESTION>",
"answer": "<ANSWER>A) Correct answer text</ANSWER>"
}},
"question4": {{
"type": "inferential",
"question": "<QUESTION>Your second inferential question stem with four options:\\nA) ...\\nB) ...\\nC) ...\\nD) ...</QUESTION>",
"answer": "<ANSWER>D) Correct answer text</ANSWER>"
}},
"question5": {{
"type": "main idea",
"question": "<QUESTION>Your main idea question stem with four options:\\nA) ...\\nB) ...\\nC) ...\\nD) ...</QUESTION>",
"answer": "<ANSWER>C) Correct answer text</ANSWER>"
}}
}}
```

**IMPORTANT** The correct answer letters in the JSON template are just examples. Adjust them as needed."""

def _extract_json_object(response_text: Optional[str]) -> Dict:
    """Parse the first JSON object found in a model reply.

    Raises:
        ValueError: The reply is empty or contains no "{" at all.
        json.JSONDecodeError: A candidate object was found but is malformed.
    """
    if not response_text:
        raise ValueError("No JSON found in response")

    # Preferred form: a ```json fenced block, as requested by the prompt.
    fenced = re.search(r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL)
    if fenced:
        return json.loads(fenced.group(1))

    # Fallback: decode the first complete object starting at the first "{".
    # raw_decode tracks nesting and ignores trailing text, whereas a
    # non-greedy regex would stop at the first "}" and cut the object short.
    start = response_text.find("{")
    if start == -1:
        raise ValueError("No JSON found in response")
    parsed, _ = json.JSONDecoder().raw_decode(response_text, start)
    return parsed


async def generate_questions_for_text(
    text_id: str,
    text_content: str,
    *,
    model: str = "gpt-4o",
    api_client=None,
    prompt_template: Optional[str] = None,
) -> Dict:
    """
    Generate 5 questions for a given text with one chat-completion request.

    Args:
        text_id (str): Unique identifier for the text
        text_content (str): The source text content
        model (str): Chat model name. Defaults to "gpt-4o"; pass another name
            (e.g. a GPT-5 model once available) to switch without editing code.
        api_client: Object exposing ``chat.completions.create``. Defaults to
            the module-level ``client``.
        prompt_template (Optional[str]): ``str.format`` template with a
            ``{source_text}`` placeholder. Defaults to the module-level
            ``QUESTION_GENERATION_PROMPT``.

    Returns:
        Dict: Always contains ``text_id``, ``success`` and ``raw_response``.
        On success it also contains ``questions`` (the parsed JSON object);
        on failure it contains ``error`` (a message). ``raw_response`` is the
        model's text whenever a reply was received, and ``None`` when the
        request itself failed. This function does not raise: every failure is
        reported through the returned dict.
    """
    try:
        template = QUESTION_GENERATION_PROMPT if prompt_template is None else prompt_template
        active_client = client if api_client is None else api_client

        # Format the prompt with the source text
        formatted_prompt = template.format(source_text=text_content)

        response = await active_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": formatted_prompt}
            ],
            temperature=0.7,  # Some creativity for question variety
            max_tokens=2000
        )

        result_text = response.choices[0].message.content

    except Exception as e:
        # Boundary handler: one failed request must not abort the whole batch,
        # so the error is reported in the result instead of being raised.
        print(f"API error for {text_id}: {e}")
        return {
            'text_id': text_id,
            'success': False,
            'error': str(e),
            'raw_response': None
        }

    # Parsing is handled separately from the request so that a bad reply is
    # not reported as an API error and the raw reply is kept for inspection.
    try:
        questions_data = _extract_json_object(result_text)
    except json.JSONDecodeError as e:  # must precede ValueError (its base class)
        print(f"JSON decode error for {text_id}: {e}")
        return {
            'text_id': text_id,
            'success': False,
            'error': f"JSON decode error: {e}",
            'raw_response': result_text
        }
    except ValueError as e:
        print(f"No usable JSON for {text_id}: {e}")
        return {
            'text_id': text_id,
            'success': False,
            'error': str(e),
            'raw_response': result_text
        }

    return {
        'text_id': text_id,
        'success': True,
        'questions': questions_data,
        'raw_response': result_text
    }

async def process_all_texts(df_source_texts: pd.DataFrame, delay_seconds: float = 1.0) -> List[Dict]:
    """
    Process all texts and generate questions for each, one request at a time.

    Args:
        df_source_texts (pd.DataFrame): DataFrame with textID and text columns
        delay_seconds (float): Pause inserted between consecutive requests to
            avoid rate limiting. No pause follows the final request.

    Returns:
        List[Dict]: One result dict per row, in row order. Empty when the
        DataFrame has no rows.
    """
    total = len(df_source_texts)
    print(f"\nSTEP 2: Generating questions for {total} texts...")
    print("Using baseline prompt (no advanced strategies)")

    results = []

    if total == 0:
        # Nothing to do; returning here also avoids dividing by zero in the
        # success-rate summary below.
        print("No texts to process")
        return results

    # enumerate() gives a progress counter that is independent of the
    # DataFrame's index labels (which need not be 0..n-1 or even numeric).
    rows = zip(df_source_texts['textID'], df_source_texts['text'])
    for position, (text_id, text_content) in enumerate(rows, start=1):
        print(f"\nProcessing {position}/{total}: {text_id}")
        print(f"Text length: {len(text_content)} characters")

        # Generate questions
        result = await generate_questions_for_text(text_id, text_content)
        results.append(result)

        if result['success']:
            print(f"Successfully generated 5 questions for {text_id}")
        else:
            print(f"Failed to generate questions for {text_id}: {result['error']}")

        # Small delay to avoid rate limiting (only needed between requests)
        if position < total and delay_seconds > 0:
            await asyncio.sleep(delay_seconds)

    # Summary
    successful = sum(1 for r in results if r['success'])
    print(f"\nGENERATION SUMMARY:")
    print(f"Total texts processed: {len(results)}")
    print(f"Successful: {successful}")
    print(f"Failed: {len(results) - successful}")
    print(f"Success rate: {successful/len(results)*100:.1f}%")

    return results



In [None]:
# =============================================================================
# STEP 3: EXPORT RESULTS TO BASELINE_QUESTIONS.CSV
# =============================================================================

def create_baseline_questions_csv(
    results: List[Dict],
    source_texts: Optional[pd.DataFrame] = None,
    output_dir: Optional[Path] = None,
) -> pd.DataFrame:
    """
    Create the baseline_questions.csv file from generation results

    Each successful result contributes one row per well-formed question
    (keys ``question1`` .. ``question5``). Failed results, results whose
    textID has no source text, and malformed question entries are reported
    and skipped so that a single bad model reply cannot abort the export.

    Args:
        results (List[Dict]): Results from question generation
        source_texts (Optional[pd.DataFrame]): DataFrame with textID and text
            columns used to look up each source text. Defaults to the
            module-level ``df_source_texts``.
        output_dir (Optional[Path]): Folder that receives
            ``baseline_questions.csv``. Defaults to the module-level
            ``database_dir``.

    Returns:
        pd.DataFrame: The baseline questions dataset. It always has the five
        output columns, even when it contains no rows.
    """
    print(f"\nSTEP 3: Creating baseline_questions.csv...")

    # Fall back to the notebook-level state when no override is given.
    source_texts = df_source_texts if source_texts is None else source_texts
    output_dir = Path(database_dir if output_dir is None else output_dir)

    columns = [
        'textID',
        'text',
        'baseline_question_type',
        'baseline_question',
        'baseline_answer',
    ]
    required_keys = ('type', 'question', 'answer')

    # Build the textID -> text lookup once instead of scanning the DataFrame
    # for every result. The first occurrence wins if a textID is duplicated.
    text_by_id = (
        source_texts.drop_duplicates(subset='textID', keep='first')
        .set_index('textID')['text']
        .to_dict()
    )

    baseline_data = []

    for result in results:
        if not result['success']:
            print(f"Skipping failed generation for {result['text_id']}")
            continue

        text_id = result['text_id']
        questions = result['questions']

        if text_id not in text_by_id:
            print(f"Skipping {text_id}: no matching source text found")
            continue

        if not isinstance(questions, dict):
            print(f"Skipping {text_id}: questions payload is not a JSON object")
            continue

        source_text = text_by_id[text_id]

        # Process each question (question1 through question5)
        for q_num in range(1, 6):
            q_key = f"question{q_num}"

            if q_key not in questions:
                continue

            q_data = questions[q_key]

            # The model's output is not guaranteed to follow the template.
            if not isinstance(q_data, dict) or any(k not in q_data for k in required_keys):
                print(f"Skipping malformed {q_key} for {text_id}")
                continue

            baseline_data.append({
                'textID': text_id,
                'text': source_text,
                'baseline_question_type': q_data['type'],
                'baseline_question': q_data['question'],
                'baseline_answer': q_data['answer']
            })

    # Explicit columns keep the CSV header present even with zero rows.
    df_baseline = pd.DataFrame(baseline_data, columns=columns)

    # Save to CSV
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "baseline_questions.csv"
    df_baseline.to_csv(output_file, index=False, encoding='utf-8')

    print(f"Created baseline_questions.csv with {len(baseline_data)} questions")
    print(f"Saved to: {output_file}")

    # Display summary by question type
    if not df_baseline.empty:
        type_counts = df_baseline['baseline_question_type'].value_counts()
        print(f"\nQuestion types generated:")
        for q_type, count in type_counts.items():
            print(f"  {q_type}: {count}")

        # Show sample questions
        print(f"\nSample questions:")
        for q_type in ['factual', 'inferential', 'main idea']:
            sample = df_baseline[df_baseline['baseline_question_type'] == q_type].head(1)
            if not sample.empty:
                row = sample.iloc[0]
                print(f"\n{q_type.title()} Question Example:")
                print(f"Text: {row['textID']}")
                # str() guards the preview against non-string model output.
                print(f"Q: {str(row['baseline_question'])[:100]}...")
                print(f"A: {row['baseline_answer']}")

    return df_baseline



In [None]:
# =============================================================================
# MAIN EXECUTION
# =============================================================================

async def main():
    """Run question generation and the CSV export for the loaded source texts.

    Prints a notice and returns without doing anything when the module-level
    ``df_source_texts`` is ``None`` or empty; otherwise generates questions,
    writes baseline_questions.csv and prints a completion summary.
    """
    nothing_loaded = df_source_texts is None or df_source_texts.empty
    if nothing_loaded:
        print("Cannot proceed: No source texts loaded")
        return

    # Step 2: generate questions for all texts
    generation_results = await process_all_texts(df_source_texts)

    # Step 3: create baseline questions CSV
    baseline_frame = create_baseline_questions_csv(generation_results)

    # Final summary
    summary_lines = (
        f"\nTASK COMPLETION SUMMARY:",
        f"Step 1: source_texts.csv created with {len(df_source_texts)} texts",
        f"Step 2: Questions generated using baseline prompt",
        f"Step 3: baseline_questions.csv created with {len(baseline_frame)} questions",
        f"\nFiles created in {database_dir}:",
        f"  - source_texts.csv",
        f"  - baseline_questions.csv",
    )
    for line in summary_lines:
        print(line)

    # The timestamp is taken last, after the summary has been printed.
    print(f"\nCompleted at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Run the main function
# NOTE(review): a bare top-level `await` is accepted by IPython/Jupyter cells
# but is a syntax error in a plain Python script, which would need
# asyncio.run(main()) instead.
await main()