## Workflow to Generate Data Using ReQUESTA

In [10]:
from pathlib import Path
import re
import pandas as pd
from src.general import get_files_in_directory, read_text_file
from src.workflow import question_generation_workflow

async def generate_mcqs(subject: str, directory_path: str, fact: int, inference: int, main_idea: int, model: str, output_directory: str) -> pd.DataFrame:
    """
    Generate MCQs for every text file in a directory and checkpoint them to CSV.

    Each file is read, passed to ``question_generation_workflow`` and the returned
    MCQ dicts are flattened into table rows. After every file the rows gathered
    *so far* are written to ``<output_directory>/<subject>_mcqs_<file stem>.csv``,
    so each CSV is a cumulative snapshot rather than the rows of a single file.

    NOTE(review): because the snapshots are cumulative, concatenating all per-file
    CSVs would repeat rows from earlier files - confirm this is what downstream
    steps expect.

    Args:
        subject (str): The subject name, stored in the 'Subject' column and used
            in the output file names.
        directory_path (str): Path to the directory containing text files. File
            stems must have the form 'X.Y' (chapter.section).
        fact (int): Number of fact-based questions to generate (forwarded to the workflow).
        inference (int): Number of inference-based questions to generate (forwarded to the workflow).
        main_idea (int): Number of main idea-based questions to generate (forwarded to the workflow).
        model (str): The model to use for question generation.
        output_directory (str): Directory to save the output CSV files. Created
            (including parents) when at least one input file exists.

    Returns:
        pd.DataFrame: All generated MCQs with columns 'Subject', 'Chapter',
        'Section', 'Question_type', 'Question' and 'Answer'. If the directory
        yields no files, an empty DataFrame with these columns is returned and
        nothing is written.

    Raises:
        ValueError: If a file stem does not split into exactly two parts on '.',
            or if a returned file path is not located under ``directory_path``.
    """
    source_dir = Path(directory_path)
    target_dir = Path(output_directory)

    files = get_files_in_directory(source_dir)
    total_files = len(files)

    # One list per output column; rows accumulate across all files.
    column_values = {
        'Subject': [],
        'Chapter': [],
        'Section': [],
        'Question_type': [],
        'Question': [],
        'Answer': [],
    }

    # Bound before the loop so an empty directory returns an empty, correctly
    # shaped frame instead of failing on an unbound local at `return`.
    mcq_df = pd.DataFrame(column_values)

    if files:
        # Only needs to happen once, not once per file.
        target_dir.mkdir(parents=True, exist_ok=True)

    # Strips a leading numbering label such as 'Q1: ' from questions and answers.
    label_prefix = re.compile(r'^Q\d+: ')

    # Count from 1 so the progress line reads "1 of N" ... "N of N".
    for position, afile in enumerate(files, start=1):
        print(f"__________________________________________Processing file {position} of {total_files}: {afile}___________________________________________________")

        # Chapter and section come from the file stem, e.g. '2.1.txt' -> ('2', '1').
        file_stem = Path(afile).relative_to(source_dir).stem
        try:
            chapter_name, section_name = file_stem.split('.')
        except ValueError:
            # The unpacking error itself carries no useful detail, so drop it from the chain.
            raise ValueError(f"Filename format is incorrect: '{file_stem}'. Expected format 'X.Y.txt'") from None

        atext = read_text_file(afile)

        mcqs = await question_generation_workflow(atext, fact=fact, inference=inference, main_idea=main_idea, model=model)

        for mcq in mcqs:
            question = label_prefix.sub('', mcq.get('mcq', 'No MCQ generated'))
            answer = label_prefix.sub('', mcq.get('mcq_answer', 'No MCQ answer generated'))

            column_values['Subject'].append(subject)
            column_values['Chapter'].append(chapter_name)
            column_values['Section'].append(section_name)
            column_values['Question_type'].append(mcq.get("question_type", "unknown"))
            column_values['Question'].append(question)
            column_values['Answer'].append(answer)

        # Checkpoint everything gathered so far, so a failure on a later file
        # does not lose the results already produced.
        mcq_df = pd.DataFrame(column_values)
        output_file = target_dir / f'{subject}_mcqs_{file_stem}.csv'
        mcq_df.to_csv(output_file, index=False)

    return mcq_df

In [None]:

import re
from pathlib import Path
import pandas as pd
from src.general import get_files_in_directory, read_text_file

def extract_texts(subject: str, directory_path: str, output_directory: str) -> pd.DataFrame:
    """
    Collect every text file under a directory into one table and write it to CSV.

    For each file the chapter and section are taken from the file stem ('X.Y'),
    runs of newline characters in the content are collapsed to a single space,
    and the word count is the number of whitespace-separated tokens in the
    resulting text. The table is saved as ``<output_directory>/<subject>_texts.csv``.

    Args:
        subject (str): The subject name, stored in the 'Subject' column and used
            in the output file name.
        directory_path (str): Path to the directory containing text files.
        output_directory (str): Directory to save the output CSV file; created
            (including parents) if missing.

    Returns:
        pd.DataFrame: One row per file with columns 'Subject', 'Chapter',
        'Section', 'Text' and 'Word Count'.

    Raises:
        ValueError: If a file stem does not split into exactly two parts on '.'.
    """
    source_dir = Path(directory_path)
    target_dir = Path(output_directory)

    # One tuple per file, in the same order as the output columns below.
    rows = []
    for path_entry in get_files_in_directory(source_dir):
        stem = Path(path_entry).relative_to(source_dir).stem

        try:
            chapter, section = stem.split('.')
        except ValueError:
            raise ValueError(f"Filename format is incorrect: '{stem}'. Expected format 'X.Y.txt'")

        # Flatten the text onto one line before counting words.
        flattened = re.sub(r'\n+', ' ', read_text_file(path_entry))
        rows.append((subject, chapter, section, flattened, len(flattened.split())))

    # Turn the row tuples back into one list per column for the DataFrame.
    header = ('Subject', 'Chapter', 'Section', 'Text', 'Word Count')
    text_df = pd.DataFrame(
        {name: [row[position] for row in rows] for position, name in enumerate(header)}
    )

    target_dir.mkdir(parents=True, exist_ok=True)
    text_df.to_csv(target_dir / f'{subject}_texts.csv', index=False)

    return text_df

In [11]:
# Run parameters for the History corpus.
# NOTE(review): `subject`, `directory_path` and `output_directory` are plain kernel
# globals; other cells in this notebook also assign or read them, so their values
# depend on the order in which cells were executed.
subject = "History"
directory_path = '../data/History'
output_directory = '../output/mcqs'


# Only main-idea questions are requested here: fact and inference counts are zero
# and main_idea=1 is forwarded to the workflow for each input file.
mcq_df = await generate_mcqs(
    subject=subject,
    directory_path=directory_path,
    fact=0,
    inference=0,
    main_idea=1,    
    model='gpt-4o',  
    output_directory=output_directory,
)

INFO:root:Workflow started at: 2025-07-23 16:55:32.749039
INFO:root:Invocation ID: 0b60415f-84b1-4a62-9a4f-ab21c1dff8af


__________________________________________Processing file 0 of 20: ..\data\History\1.3.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested by the user following the guidelines below. \n\n[Guidelines]\n  Step1: Summarize the text \n  The text is divided into chunks (as indicated by the html tags such as <chunk1></chunk1>). Summarize the content within each chunk, maintaining the c

__________________________________________Processing file 1 of 20: ..\data\History\10.3.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested 

__________________________________________Processing file 2 of 20: ..\data\History\10.4.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested 

__________________________________________Processing file 3 of 20: ..\data\History\11.3.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested by the user following the guidelines below. \n\n[Guidelines]\n  Step1: Summarize 

__________________________________________Processing file 4 of 20: ..\data\History\11.5.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested 

__________________________________________Processing file 5 of 20: ..\data\History\13.2.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested 

__________________________________________Processing file 6 of 20: ..\data\History\13.3.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested 

__________________________________________Processing file 7 of 20: ..\data\History\14.3.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested 

__________________________________________Processing file 8 of 20: ..\data\History\14.4.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested 

__________________________________________Processing file 9 of 20: ..\data\History\2.2.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested 

__________________________________________Processing file 10 of 20: ..\data\History\3.4.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested 

__________________________________________Processing file 11 of 20: ..\data\History\4.3.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested by the user following the guidelines below. \n\n[Guidelines]\n  Step1: Summarize the text \n  The text is divided into chunks (as indicated by the html tags such as <chunk1></chunk1>). Summarize the content within each chunk, maintaining the c

__________________________________________Processing file 12 of 20: ..\data\History\4.4.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested by the user following the guidelines below. \n\n[Guidelines]\n  Step1: Summarize the text \n  The text is divided into chunks (as indicated by the html tags such as <chunk1></chunk1>). Summarize the content within each chunk, maintaining the c

__________________________________________Processing file 13 of 20: ..\data\History\4.5.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested by the user following the guidelines below. \n\n[Guidelines]\n  Step1: Summarize the text \n  The text is divided into chunks (as indicated by the html tags such as <chunk1></chunk1>). Summarize the content within each chunk, maintaining the c

__________________________________________Processing file 14 of 20: ..\data\History\5.4.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question

__________________________________________Processing file 15 of 20: ..\data\History\6.2.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested 

__________________________________________Processing file 16 of 20: ..\data\History\6.3.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested by the user following the guidelines below. \n\n[Guidelines]\n  Step1: Summarize the text \n  The text is divided into chunks (as indicated by the html tags such as <chunk1></chunk1>). Summarize the content within each chunk, maintaining the c

__________________________________________Processing file 17 of 20: ..\data\History\7.1.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at

__________________________________________Processing file 18 of 20: ..\data\History\8.4.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested 

__________________________________________Processing file 19 of 20: ..\data\History\9.4.txt___________________________________________________


INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested by the user following the guidelines below. \n\n[Guidelines]\n  Step1: Summarize 

In [1]:
from src.general import *

In [4]:
# Source directory and destination file for the CSV-combining step.
# NOTE(review): this rebinds the global `directory_path`, which the MCQ-generation
# and text-extraction cells also use as their *input* directory. Running those cells
# after this one would point them at '../output/mcqs/main_ideas' - confirm intent.
directory_path = '../output/mcqs/main_ideas'
output_file = '../output/mcqs/all_main_ideas.csv'

In [5]:
# `combine_csv_files` is brought in by the wildcard import from src.general; its
# implementation is not visible in this notebook. Presumably it merges the CSV files
# found in `directory_path` into the single file `output_file` - verify against src.general.
combine_csv_files(
    directory_path=directory_path,
    output_file=output_file,
)

In [None]:
# NOTE(review): this cell reads `subjects`, `chapter_names`, `section_names`,
# `question_types`, `mcq_list` and `mcq_answer_list` as kernel globals. In the visible
# part of this notebook they are only assigned at top level by the inline generation
# loop in a later cell (inside `generate_mcqs` they are locals), so this cell raises
# NameError on a fresh kernel unless that later cell has been run first.
# It also relies on `subject`, `output_directory`, `pd` and `Path` already being defined.

# Create a DataFrame to store the results
mcq_data = {
    'Subject': subjects,
    'Chapter': chapter_names,
    'Section': section_names,
    'Question_type': question_types,
    'Question': mcq_list,
    'Answer': mcq_answer_list
}

mcq_df = pd.DataFrame(mcq_data)

# Ensure the output directory exists
# (rebinds the global `output_directory` from str to Path)
output_directory = Path(output_directory)
output_directory.mkdir(parents=True, exist_ok=True)
# Rebinds the global `output_file`, which the CSV-combining cell also assigns.
output_file = output_directory / f'{subject}_mcqs.csv'

# Save the DataFrame to a CSV file
mcq_df.to_csv(output_file, index=False)

In [48]:
# Build the per-file text table (with word counts) and save it as '<subject>_texts.csv'
# inside `output_directory`.
# NOTE(review): `subject`, `directory_path` and `output_directory` are whatever an
# earlier-executed cell last assigned. `directory_path` is set to both
# '../data/History' and '../output/mcqs/main_ideas' elsewhere in this notebook -
# confirm which one is in effect before running.
text_df = extract_texts(
    subject=subject,
    directory_path=directory_path,
    output_directory=output_directory,
)

In [None]:
from pathlib import Path
import re
import pandas as pd 
from src.general import *
from src.workflow import question_generation_workflow


# get files from ../data/sample_texts
files = get_files_in_directory(directory_path)
directory_path = Path(directory_path)
subjects = []
chapter_names = []
section_names = []
question_types = []
mcq_list = []
mcq_answer_list = []

for afile in files: 
    # Get chapter and section names from the file name
    file_path = Path(afile)
    # Extract relative file name (e.g., "2.1.txt")
    relative_file_name = file_path.relative_to(directory_path)
    # Remove the .txt suffix
    file_stem = relative_file_name.stem  
    # Extract chapter and section names
    try:
        chapter_name, section_name = file_stem.split('.')
    except ValueError:
        raise ValueError(f"Filename format is incorrect: '{file_stem}'. Expected format 'X.Y.txt'")
    
    # Read the afile content
    atext = read_text_file(afile)

    # Generate MCQs using the question generation workflow
    mcqs = await question_generation_workflow(atext,
                                                fact = 1, 
                                                inference = 1,
                                                main_idea = 1,
                                                model = 'gpt-4o',)
    for mcq in mcqs:
        amcq = mcq.get('mcq', 'No MCQ generated')
        amcq_answer = mcq.get('mcq_answer', 'No MCQ answer generated')

        # Remove the 'Q1: ' from the mcq and mcq_answer
        amcq = re.sub(r'^Q\d+: ', '', amcq)
        amcq_answer = re.sub(r'^Q\d+: ', '', amcq_answer)
        
        question_type = mcq.get("question_type", "unknown")

        # Append the results to the list
        question_types.append(question_type)
        mcq_list.append(amcq)
        mcq_answer_list.append(amcq_answer)

        chapter_names.append(chapter_name)
        section_names.append(section_name)
        subjects.append(subject)


# Create a DataFrame to store the results

mcq_data = {
    'Subject': subjects,
    'Chapter': chapter_names,
    'Section': section_names,
    'Question_type': question_types,
    'Question': mcq_list,
    'Answer': mcq_answer_list
}


mcq_df = pd.DataFrame(mcq_data)
# save the DataFrame to a CSV file

output_directory = Path('../output/mcqs')
output_directory.mkdir(parents=True, exist_ok=True)
output_file = output_directory / f'{subject}_mcqs.csv'

# save output_file as a csv file
mcq_df.to_csv(output_file, index=False)


    



INFO:root:Workflow started at: 2025-07-23 10:51:00.983293
INFO:root:Invocation ID: 4baa08ae-a485-4c3b-9a75-d0e85410b99f
INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experi

In [None]:
from pathlib import Path
import re
import pandas as pd 
from src.general import *
from src.workflow import question_generation_workflow

# Collect the raw text of every "<chapter>.<section>.txt" file under
# ../data/sample_texts into one CSV in ../output/texts.
# NOTE(review): "pychology" looks like a misspelling; it is kept as-is because
# it is written to the Subject column and used in the output file name.
subject = "pychology"
directory_path = '../data/sample_texts'
files = get_files_in_directory(directory_path)
directory_path = Path(directory_path)

# Column buffers: one entry per input file.
subjects, chapter_names, section_names, texts = [], [], [], []

for afile in files:
    # The stem of the path relative to directory_path (e.g. "2.1") carries the
    # chapter and section, separated by a single dot.
    file_stem = Path(afile).relative_to(directory_path).stem
    try:
        chapter_name, section_name = file_stem.split('.')
    except ValueError:
        raise ValueError(f"Filename format is incorrect: '{file_stem}'. Expected format 'X.Y.txt'")

    atext = read_text_file(afile)

    subjects.append(subject)
    chapter_names.append(chapter_name)
    section_names.append(section_name)
    texts.append(atext)

# Map the CSV headers onto the buffers and build the table.
text_data = {'Subject': subjects, 'Chapter': chapter_names, 'Section': section_names, 'Text': texts}
text_df = pd.DataFrame(text_data)

# Create the output folder if needed, then write the table without its index column.
output_directory = Path('../output/texts')
output_directory.mkdir(parents=True, exist_ok=True)
output_file = output_directory.joinpath(f'{subject}_texts.csv')
text_df.to_csv(output_file, index=False)


In [39]:
# Display the generated MCQ table.
mcq_df

Unnamed: 0,Subject,Chapter,Section,Question_type,Question,Answer
0,pychology,2,1,fact,Which feature of surveys provides participants...,B) Anonymity provided to respondents.
1,pychology,2,1,inference,What challenge does field research face when a...,C) The presence of numerous unmanageable varia...
2,pychology,2,1,main_idea,Which of the following best expresses the main...,B) The passage mainly discusses how sociologis...
3,pychology,2,2,fact,What role do symbols play in communication wit...,B) They aid in understanding experiences by co...
4,pychology,2,2,inference,How does the Sapir-Whorf hypothesis explain th...,B) It suggests that linguistic structures guid...
5,pychology,2,2,main_idea,Which of the following best expresses the main...,"A) The passage explores how cultural elements,..."


In [40]:
# Prepare the MCQ output folder and the subject-specific CSV path inside it.
# `subject` must already be defined in the kernel.
output_directory = Path('../output/mcqs')
output_directory.mkdir(parents=True, exist_ok=True)
output_file = output_directory.joinpath(f'{subject}_mcqs.csv')

In [41]:
# Write mcq_df to output_file as CSV, leaving out the DataFrame index column.
mcq_df.to_csv(output_file, index=False)

In [9]:
# Display the file paths collected by get_files_in_directory.
files

['../data/sample_texts\\2.1.txt', '../data/sample_texts\\2.2.txt']

In [None]:
# Print a header plus the first 1000 characters of every collected file.
for afile in files:
    text = read_text_file(afile)
    print(f"Content of {afile}:", text[:1000], sep="\n")



Content of ../data/sample_texts\2.1.txt:
Learning Objectives
By the end of this section, you should be able to:
•	Recall the 6 Steps of the Scientific Method
•	Differentiate between four kinds of research methods: surveys, field research, experiments, and secondary data analysis.
•	Explain the appropriateness of specific research approaches for specific topics.
Sociologists examine the social world, see a problem or interesting pattern, and set out to study it. They use research methods to design a study. Planning the research design is a key step in any sociological study. Sociologists generally choose from widely used methods of social investigation: primary source data collection such as survey, participant observation, ethnography, case study, unobtrusive observations, experiment, and secondary data analysis, or use of existing sources. Every research method comes with plusses and minuses, and the topic of study strongly influences which method or methods are put to use. When you a

en_core_web_sm is already installed.


In [14]:
for afile in files:
    text = read_text_file(afile)
    mcq_list = await question_generation_workflow(text, 
                                            fact = 1,
                                            inference = 1,
                                            main_idea = 1,
                                            model = 'gpt-4o',)
    print(f"Generated MCQs for {afile}:")
    print(mcq_list)

INFO:root:Workflow started at: 2025-07-23 09:37:36.788318
INFO:root:Invocation ID: b6760318-8f75-4a40-a547-cf41dd729fd8
INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): openaipublic.blob.core.windows.net:443
DEBUG:urllib3.connectionpool:https://openaipublic.blob.core.windows.net:443 "GET /encodings/o200k_base.tiktoken HTTP/1.1" 200 3613922
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summ

Generated MCQs for ../data/sample_texts\2.1.txt:
[{'system_prompt': '[ROLE]:\nYou are an experienced college instructor. You are an expert in writing multiple-choice questions to assess students’ understanding of academic texts (e.g., chapters in textbooks, academic articles).\n\n[TASK]:\nYour task is to write a factual multiple-choice question (see DEFINITION) based on a fact provided by the user. \nThis fact will be directly supported by a "source text", which is an excerpt from the academic material. \nA "contextual text" will also be provided to help you understand the broader meaning and useage of the source text. \nFollow the guidelines in [GUIDELINES] to do exactly what you are instructed to do. \nNOTE: The user may provide an already generated question with evaluations, in which case you should revise the provided question based on the guidelines in [GUIDELINES] and the evaluations.\n\n[DEFINITION]\n**What are factual questions?**\nFactual questions are those that have a single

INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.started
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:close.complete
DEBUG:httpcore.connection:connect_tcp.started host='api.openai.com' port=443 local_address=None timeout=5.0 socket_options=None
DEBUG:httpcore.connection:connect_tcp.complete return_value=<httpcore._backends.anyio.AnyIOStream object at 0x000002130FE48A30>
DEBUG:httpcore.connection:start_tls.started ssl_context=<ssl.SSLContext object at 0x000002130D52E740> server_hostname='api.openai.com' timeout=5.0
DEBUG:httpcore.connection:start_tls.complete return_value=<httpcore._b

Generated MCQs for ../data/sample_texts\2.2.txt:
[{'system_prompt': '[ROLE]:\nYou are an experienced college instructor. You are an expert in writing multiple-choice questions to assess students’ understanding of academic texts (e.g., chapters in textbooks, academic articles).\n\n[TASK]:\nYour task is to write a factual multiple-choice question (see DEFINITION) based on a fact provided by the user. \nThis fact will be directly supported by a "source text", which is an excerpt from the academic material. \nA "contextual text" will also be provided to help you understand the broader meaning and useage of the source text. \nFollow the guidelines in [GUIDELINES] to do exactly what you are instructed to do. \nNOTE: The user may provide an already generated question with evaluations, in which case you should revise the provided question based on the guidelines in [GUIDELINES] and the evaluations.\n\n[DEFINITION]\n**What are factual questions?**\nFactual questions are those that have a single

In [15]:
# Display the first collected file path.
files[0]

'../data/sample_texts\\2.1.txt'

In [None]:

# Derive the chapter and section identifiers from a sample file path of the
# form "<directory>/<chapter>.<section>.txt".
files[0] = '../data/sample_texts\\2.1.txt'
directory_path = '../data/sample_texts'

import re

# Strip the directory prefix. re.escape keeps the dots in directory_path
# literal (unescaped, ".." would match any two characters), "^" restricts the
# match to the start of the path, and [\\/] accepts either path separator.
file_name = re.sub('^' + re.escape(directory_path) + r'[\\/]', '', files[0])
# Remove only a trailing ".txt" extension: the dot is escaped and the pattern is
# anchored, so text such as "atxt" elsewhere in the name is left alone.
file_name = re.sub(r'\.txt$', '', file_name)

# "<chapter>.<section>": the first two dot-separated fields.
name_parts = file_name.split('.')
chapter_name = name_parts[0]
section_name = name_parts[1]
print(chapter_name)
print(f"Processing file: {file_name}")

2
Processing file: 2.1


In [16]:
# Load the first collected file and preview its beginning.
atext = read_text_file(files[0])
print(atext[:1000])  # Print the first 1000 characters of the file

Learning Objectives
By the end of this section, you should be able to:
•	Recall the 6 Steps of the Scientific Method
•	Differentiate between four kinds of research methods: surveys, field research, experiments, and secondary data analysis.
•	Explain the appropriateness of specific research approaches for specific topics.
Sociologists examine the social world, see a problem or interesting pattern, and set out to study it. They use research methods to design a study. Planning the research design is a key step in any sociological study. Sociologists generally choose from widely used methods of social investigation: primary source data collection such as survey, participant observation, ethnography, case study, unobtrusive observations, experiment, and secondary data analysis, or use of existing sources. Every research method comes with plusses and minuses, and the topic of study strongly influences which method or methods are put to use. When you are conducting research think about the be

In [17]:
mcqs = await question_generation_workflow(atext, 
                                            fact = 1, 
                                            inference = 1,
                                            main_idea = 1,
                                            model = 'gpt-4o',)  

INFO:root:Workflow started at: 2025-07-23 09:42:11.978485
INFO:root:Invocation ID: a940adbf-53fa-4b64-9140-e403b535927c
INFO:root:Text is now successfully chunked into smaller parts.
INFO:root:Table 'plan_metadata' created or already exists.
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '[ROLE]:\n  You are an experienced college instructor specializing in developing reading comprehension questions to evaluate students\' understanding of academic texts, such as textbook chapters and academic articles. You excel at identifying and selecting key facts and inferences from these texts for question development.\n\n[TASK]:\n  Your task is to summarize the academic text provided by the user and select the specified number of facts and inferences as requested by the user following the guidelines below. \n\n[Guidelines]\n  Step1: Summarize the text \n  The text is divided into chun

In [36]:
# List the keys of the first record returned by question_generation_workflow.
mcqs[0].keys()

dict_keys(['system_prompt', 'user_prompt', 'model', 'completion', 'execution_time', 'input_tokens', 'output_tokens', 'question_type', 'invocation_id', 'chunk', 'attempt', 'mcq', 'mcq_answer', 'timestamp'])

In [20]:
# Show the first record's question text ('mcq'), or a placeholder if that key is missing.
mcqs[0].get('mcq', 'No MCQ generated')

"Q1: What is the main purpose of conducting surveys in sociological research?\n\nA) Surveys are primarily used to increase television program ratings.\n\nB) Surveys are designed to observe real-time social interactions in natural settings.\n\nC) Surveys collect information on people's behaviors and opinions.\n\nD) Surveys focus solely on gathering qualitative data."

In [None]:
# Same lookup as the previous cell: the first record's 'mcq' text, with a placeholder default.
mcqs[0].get('mcq', 'No MCQ generated')

In [22]:
# Show the first record's answer text ('mcq_answer'), or a placeholder if that key is missing.
mcqs[0].get('mcq_answer', 'No MCQ answer generated')

"Q1: C) Surveys collect information on people's behaviors and opinions."

In [23]:
# Drop the leading "Q<number>: " label from the first record's question and
# answer text, updating the record in place.
import re
for field in ('mcq', 'mcq_answer'):
    mcqs[0][field] = re.sub(r'^Q\d+: ', '', mcqs[0][field])

In [25]:
# Show the first record's question text after the "Q<n>: " label was removed.
mcqs[0]['mcq'] 

"What is the main purpose of conducting surveys in sociological research?\n\nA) Surveys are primarily used to increase television program ratings.\n\nB) Surveys are designed to observe real-time social interactions in natural settings.\n\nC) Surveys collect information on people's behaviors and opinions.\n\nD) Surveys focus solely on gathering qualitative data."

In [26]:
# Show the first record's answer text after the "Q<n>: " label was removed.
mcqs[0]['mcq_answer']

"C) Surveys collect information on people's behaviors and opinions."

In [None]:
{'system_prompt': '[ROLE]:\nYou are an experienced college instructor. You are an expert in writing multiple-choice questions to assess students’ understanding of academic texts (e.g., chapters in textbooks, academic articles).\n\n[TASK]:\nYou are asked to write a main idea multiple-choice question based on an academic text summary provided by the user. \nFollow the guidelines in [GUIDELINES] to do exactly what you are instructed to do. \nNOTE: The user may provide an already generated question with evaluations, in which case you should revise the provided question based on the guidelines in [GUIDELINES] and the evaluations.\n\n[DEFINITION]\n**What are main idea questions?**\nMain idea questions assess a student\'s ability to identify the central point or primary argument of a text, passage, or section.\nInstead of focusing on specific details, examples, or minor points, these questions target the big picture — what the author is fundamentally trying to convey.\n\n[GUIDELINES]\nStep1: Carefully read the TEXT to understand its main idea.  \nStep2: Write a clear question stem. Typical stems include:\n  - "Which of the following best expresses the main idea of the passage?"\n  - "What is the primary purpose of the text?"\n  - "What is the central argument of the article?"\nStep3: Create the correct answer option. \n  - It should accurately reflect the main idea.\n  - It should be broad enough to cover the whole passage but specific enough to show real understanding.\n  - It should use words that are in the question sparingly.\nStep4: Write three plausible distractors. 
Good distractors often:\n  - Reflect common misconceptions or misinterpretations of the text.\n  - Sound reasonable to a student who only skimmed or misunderstood the text.\n  - Independent of each other and NOT variants of the correct answer.\n  - May reflect minor details from the text.\n  - Be similar in length, structure, tone, and complexity to the correct answer.\n  - Avoid deterministic words (e.g., solely, completely, always, never, purely, every, only) unless they accurately reflect real-world absolutes\n      Many test-takers are trained to spot absolute terms in options as red flags. Using these words in distractors can weaken the item’s ability to discriminate between high- and low-performing students unless they represent a plausible yet incorrect overstatement. Do not use them simply to make distractors sound wrong or extreme.\n      **Example**: \n      What is the main idea of the passage?\n        A). Urban planning always leads to positive outcomes for all residents.(Unrealistic exaggeration)\n        B). The sole purpose of urban planning is to increase green spaces in cities.(Overgeneralization)  \n        C). Urban planning must completely eliminate inequality to be effective. (Overly absolute)\n        D). Urban planning shapes city life and balances livability, equity, and sustainability. (correct answer)\n      Note: The distractors are easily dismissed by test-wise students, regardless of whether they understood the passage.\nStep5: Check the length of the correct answer. If the answer is longer than all the distractors, adjust it to be more concise while retaining its meaning.\nStep6: Self-critique the question and options to ensure clarity, accuracy, and relevance. 
\n  - Ensure that the question is clear and unambiguous.\n  - Ensure that the correct answer is clearly the best choice.\n  - Ensure that the distractors are plausible and relevant.\n  - Ensure that all the options are similar in length, structure, tone, and complexity.\n\n\nPrint out your reasoning BEFORE you print out the mutliple-choice main idea question. \n\nIMPORTANT: \n1) Wrap up the multiple-choice main idea question within <QUESTION></QUESTION> tags. The stem SHOULD be included in the <QUESTION> tags. Example:\n<QUESTION>What is the main idea of the passage?\n  \\nA) Remote work has recently become more common because of its benefits to companies.\n  \\nB) Remote work is better than working in an office.\n  \\nC) Most companies now require employees to work remotely.\n  \\nD) Remote work only benefits employees, not companies.</QUESTION>\n2) Wrap up the correct answer within <ANSWER></ANSWER> tags. The answer should contain the letter of the correct answer (A, B, C, or D) and the text of the correct answer. Example:\n<ANSWER>A) Remote work has recently become more common because of its benefits to companies.</ANSWER>\n', 'user_prompt': '<text>\nThe section outlines learning objectives for understanding the geographical and cultural context of West Africa, highlighting the significance of Islam, the diversity of the region, and the social and political structures that dominated pre-colonial West Africa. It describes how Islam spread across the region, influencing trade and political dynamics, and notes the different agricultural practices and social structures of West African societies.This section describes the rise of major African empires such as Ghana, Mali, and Songhay, emphasizing their political and economic power, particularly through control of trade routes. 
It also covers the role and development of slavery within these empires, illustrating various forms of servitude and highlighting how Islamic influences expanded the slave trade beyond Africa.The text discusses how the European slave trade developed, focusing on the role of Portugal and other European countries in expanding the demand for enslaved Africans. It shows how slavery evolved into a race-based system in the New World, with significant impacts on African societies and a lasting legacy in the Americas.\n</text>', 'model': 'gpt-4o', 'completion': 'Reasoning:\n\nStep 1: After reviewing the provided text, the main idea revolves around the historical and cultural dynamics of pre-colonial and colonial West Africa. The text highlights the role of Islam, trade, political and social structures, and the development of slavery within African empires, and how European interests expanded the slave trade and its effects.\n\nStep 2: Writing the question stem, I will ask about the overall theme or purpose of the passage.\n\nStep 3: The correct answer needs to encapsulate the broad overview of West African history, the rise of empires, the influence of Islam, and the evolution of slavery due to European involvement.\n\nStep 4: Distractors need to misdirect by focusing on isolated details or suggest incorrect interpretations, but remain plausible.\n\nStep 5: Check that the correct answer is not significantly longer than the distractors.\n\nStep 6: Review all components to ensure clarity and accuracy.\n\n<QUESTION>What is the main idea of the passage?\n  \\nA) The achievements of West African rulers in expanding their territories.\n  \\nB) The diverse agricultural practices that sustained West African societies.\n  \\nC) The historical development and importance of West Africa, with a focus on Islam, trade, and the evolution of slavery.\n  \\nD) The exclusive impact of European colonization on West African political systems.</QUESTION>\n\n<ANSWER>C) The historical 
development and importance of West Africa, with a focus on Islam, trade, and the evolution of slavery.</ANSWER>', 
 'execution_time': '0:00:07.636565', 
 'input_tokens': 1114, 
 'output_tokens': 307, 
 'question_type': 'main_idea', 
 'invocation_id': '403f565a-14c7-47e3-a06c-52d20c5c9b5a', 
 'chunk': '[]', 
 'attempt': 1, 
 'mcq': 'What is the main idea of the passage regarding the historical context of West Africa?', 
 'mcq_answer': 'C) The historical development and importance of West Africa, with a focus on Islam, trade, and the evolution of slavery.', 'timestamp': '2025-07-23T16:50:02.519381'}