## Workflow to Generate Data Using ReQUESTA

In [None]:
from pathlib import Path
import re
import pandas as pd
from src.general import get_files_in_directory, read_text_file
from src.workflow import question_generation_workflow

async def generate_mcqs(subject: str, directory_path: str, fact: int, inference: int, main_idea: int, model: str, output_directory: str) -> pd.DataFrame:
    """
    Generate MCQs for every text file in a directory, saving one CSV per input file.

    Each input file must be named ``<chapter>.<section>.<extension>`` (for example
    ``2.1.txt``). The MCQs generated for a file are written to
    ``<output_directory>/<subject>_mcqs_<chapter>.<section>.csv``.

    Args:
        subject (str): The subject name; stored in the 'Subject' column and used in file names.
        directory_path (str): Path to the directory containing text files.
        fact (int): Number of fact-based questions to generate.
        inference (int): Number of inference-based questions to generate.
        main_idea (int): Number of main idea-based questions to generate.
        model (str): The model to use for question generation.
        output_directory (str): Directory to save the output CSV files.

    Returns:
        pd.DataFrame: The MCQs generated across all processed files, with the columns
        Subject, Chapter, Section, Question_type, Question and Answer. The frame is
        empty (but still has those columns) when the directory contains no files.

    Raises:
        ValueError: If a file stem does not split into exactly two dot-separated parts.
    """
    columns = ['Subject', 'Chapter', 'Section', 'Question_type', 'Question', 'Answer']
    # Matches the leading numbering such as 'Q1: ' on a question or answer
    question_prefix = re.compile(r'^Q\d+: ')

    # Convert directory paths to Path objects
    directory_path = Path(directory_path)
    output_directory = Path(output_directory)

    # Get files from the specified directory
    files = get_files_in_directory(directory_path)
    total_files = len(files)

    # Rows from every file, so the return value covers the whole run rather
    # than only the last file processed
    all_rows = []

    # Count from 1 so the progress message reads "file 1 of N" ... "file N of N"
    for index, afile in enumerate(files, start=1):
        print(f"__________________________________________Processing file {index} of {total_files}: {afile}___________________________________________________")

        # The stem depends only on the final path component, so it can be read
        # directly from the file path
        file_stem = Path(afile).stem

        try:
            chapter_name, section_name = file_stem.split('.')
        except ValueError as err:
            raise ValueError(f"Filename format is incorrect: '{file_stem}'. Expected format 'X.Y.txt'") from err

        # Read the file content
        atext = read_text_file(afile)

        # Generate MCQs using the question generation workflow
        mcqs = await question_generation_workflow(atext, fact=fact, inference=inference, main_idea=main_idea, model=model)

        # One row per generated question, with the 'Q<n>: ' numbering removed
        file_rows = [
            {
                'Subject': subject,
                'Chapter': chapter_name,
                'Section': section_name,
                'Question_type': mcq.get("question_type", "unknown"),
                'Question': question_prefix.sub('', mcq.get('mcq', 'No MCQ generated')),
                'Answer': question_prefix.sub('', mcq.get('mcq_answer', 'No MCQ answer generated')),
            }
            for mcq in mcqs
        ]

        # Ensure the output directory exists (idempotent), then save this file's MCQs.
        # Passing `columns` keeps the header row even when no MCQs were generated.
        output_directory.mkdir(parents=True, exist_ok=True)
        output_file = output_directory / f'{subject}_mcqs_{file_stem}.csv'
        pd.DataFrame(file_rows, columns=columns).to_csv(output_file, index=False)

        all_rows.extend(file_rows)

    return pd.DataFrame(all_rows, columns=columns)

In [24]:

import re
from pathlib import Path
import pandas as pd
from src.general import get_files_in_directory, read_text_file

def extract_texts(subject: str, directory_path: str, output_directory: str) -> pd.DataFrame:
    """
    Collect the text of every file in a directory into one table and save it as CSV.

    Runs of newline characters in each text are collapsed to a single space, and the
    whitespace-separated word count of the result is recorded alongside it. The table
    is written to ``<output_directory>/<subject>_texts.csv``.

    Args:
        subject (str): The subject name.
        directory_path (str): Path to the directory containing text files.
        output_directory (str): Directory to save the output CSV file.

    Returns:
        pd.DataFrame: One row per file with the columns Subject, Chapter, Section,
        Text and Word Count.

    Raises:
        ValueError: If a file stem does not split into exactly two dot-separated parts.
    """
    source_dir = Path(directory_path)
    target_dir = Path(output_directory)
    newline_runs = re.compile(r'\n+')

    # Column name -> values, filled in file order
    table = {
        'Subject': [],
        'Chapter': [],
        'Section': [],
        'Text': [],
        'Word Count': [],
    }

    for entry in get_files_in_directory(source_dir):
        # Chapter and section are encoded in the file name as '<chapter>.<section>'
        file_stem = Path(entry).relative_to(source_dir).stem
        try:
            chapter, section = file_stem.split('.')
        except ValueError:
            raise ValueError(f"Filename format is incorrect: '{file_stem}'. Expected format 'X.Y.txt'")

        # Flatten the text onto one line before counting words
        flattened = newline_runs.sub(' ', read_text_file(entry))
        row = (subject, chapter, section, flattened, len(flattened.split()))

        for column_values, value in zip(table.values(), row):
            column_values.append(value)

    text_df = pd.DataFrame(table)

    # Create the destination folder if needed, then write the CSV
    target_dir.mkdir(parents=True, exist_ok=True)
    text_df.to_csv(target_dir / f'{subject}_texts.csv', index=False)

    return text_df

In [None]:
# Extract the Anthropology texts and write 'Anthropology_texts.csv'.
# NOTE(review): the texts CSV is written into the MCQ output folder here, while
# another cell in this notebook writes texts to '../output/texts' - confirm which
# location is intended.
subject = "Anthropology"
directory_path = '../data/anthropology'
output_directory = '../output/mcqs'

extract_texts(subject, directory_path, output_directory)

In [None]:
# Generate MCQs for the History texts: main-idea questions only
# (fact=0, inference=0, main_idea=1) using 'gpt-4o'.
# The top-level `await` relies on the notebook running the cell in an event loop.
subject = "History"
directory_path = '../data/History'
output_directory = '../output/mcqs'


mcq_df = await generate_mcqs(
    subject=subject,
    directory_path=directory_path,
    fact=0,
    inference=0,
    main_idea=1,    
    model='gpt-4o',  
    output_directory=output_directory,
)

In [11]:
from src.general import *

In [12]:
# Input folder and output file for the combine step in the next cell.
# NOTE: this rebinds the global `directory_path` used by other cells.
directory_path = '../output/mcqs/all_mcqs'
output_file = '../output/mcqs/all_mcqs.csv'

In [13]:
# `combine_csv_files` comes from the `src.general` wildcard import.
# Presumably it merges the CSV files found in `directory_path` into `output_file` -
# verify against its definition.
combine_csv_files(
    directory_path=directory_path,
    output_file=output_file,
)

In [None]:
# De-duplicate the IWF evaluation file on its 'Question' column and write the
# result to a separate cleaned CSV (the input file is passed by path only).
iwf_file = "../output/mcqs/all_mcqs/IWF_EVAL.csv"
output_clean_file = '../output/mcqs/mcq_IWF_cleaned.csv'
# remove duplicates
remove_duplicates_by_column(
    input_file=iwf_file,
    column_name='Question',
    output_file=output_clean_file
        )

In [None]:
# Build and save the MCQ table from lists that already exist in the kernel.
# NOTE: this cell defines none of `subjects`, `chapter_names`, `section_names`,
# `question_types`, `mcq_list`, `mcq_answer_list`, `subject` or `output_directory`;
# they must have been populated by another cell (e.g. the inline MCQ loop) first.

# Create a DataFrame to store the results
mcq_data = {
    'Subject': subjects,
    'Chapter': chapter_names,
    'Section': section_names,
    'Question_type': question_types,
    'Question': mcq_list,
    'Answer': mcq_answer_list
}

mcq_df = pd.DataFrame(mcq_data)

# Ensure the output directory exists
output_directory = Path(output_directory)
output_directory.mkdir(parents=True, exist_ok=True)
output_file = output_directory / f'{subject}_mcqs.csv'

# Save the DataFrame to a CSV file
mcq_df.to_csv(output_file, index=False)

In [48]:
# Extract texts using whatever `subject`, `directory_path` and `output_directory`
# are currently bound in the kernel; the result is kept in `text_df`.
text_df = extract_texts(
    subject=subject,
    directory_path=directory_path,
    output_directory=output_directory,
)

In [None]:
from pathlib import Path
import re
import pandas as pd 
from src.general import *
from src.workflow import question_generation_workflow


# Inline (non-function) version of the MCQ workflow: generate one fact, one
# inference and one main-idea question per file with 'gpt-4o', then save all
# questions to a single '<subject>_mcqs.csv'.
# NOTE: `subject` and `directory_path` are not set in this cell; they must
# already be bound in the kernel.

# List the input files under the current `directory_path`
files = get_files_in_directory(directory_path)
directory_path = Path(directory_path)
subjects = []
chapter_names = []
section_names = []
question_types = []
mcq_list = []
mcq_answer_list = []

for afile in files: 
    # Get chapter and section names from the file name
    file_path = Path(afile)
    # Extract relative file name (e.g., "2.1.txt")
    relative_file_name = file_path.relative_to(directory_path)
    # Remove the .txt suffix
    file_stem = relative_file_name.stem  
    # Extract chapter and section names
    try:
        chapter_name, section_name = file_stem.split('.')
    except ValueError:
        raise ValueError(f"Filename format is incorrect: '{file_stem}'. Expected format 'X.Y.txt'")
    
    # Read the afile content
    atext = read_text_file(afile)

    # Generate MCQs using the question generation workflow
    mcqs = await question_generation_workflow(atext,
                                                fact = 1, 
                                                inference = 1,
                                                main_idea = 1,
                                                model = 'gpt-4o',)
    for mcq in mcqs:
        amcq = mcq.get('mcq', 'No MCQ generated')
        amcq_answer = mcq.get('mcq_answer', 'No MCQ answer generated')

        # Remove the 'Q1: ' from the mcq and mcq_answer
        amcq = re.sub(r'^Q\d+: ', '', amcq)
        amcq_answer = re.sub(r'^Q\d+: ', '', amcq_answer)
        
        question_type = mcq.get("question_type", "unknown")

        # Append the results to the list
        question_types.append(question_type)
        mcq_list.append(amcq)
        mcq_answer_list.append(amcq_answer)

        # One entry per question so all column lists stay the same length
        chapter_names.append(chapter_name)
        section_names.append(section_name)
        subjects.append(subject)


# Create a DataFrame to store the results

mcq_data = {
    'Subject': subjects,
    'Chapter': chapter_names,
    'Section': section_names,
    'Question_type': question_types,
    'Question': mcq_list,
    'Answer': mcq_answer_list
}


mcq_df = pd.DataFrame(mcq_data)
# save the DataFrame to a CSV file

output_directory = Path('../output/mcqs')
output_directory.mkdir(parents=True, exist_ok=True)
output_file = output_directory / f'{subject}_mcqs.csv'

# save output_file as a csv file
mcq_df.to_csv(output_file, index=False)


    



In [None]:
from pathlib import Path
import re
import pandas as pd 
from src.general import *
from src.workflow import question_generation_workflow

# Inline (non-function) text extraction for the sample texts: one row per file
# with Subject, Chapter, Section and Text, saved to '../output/texts'.
# NOTE(review): "pychology" looks like a typo for "psychology"; the value ends up
# in the 'Subject' column and in the output file name - confirm before changing.
subject = "pychology"
directory_path = '../data/sample_texts'
# get files from ../data/sample_texts
files = get_files_in_directory(directory_path)
directory_path = Path(directory_path)
subjects = []
chapter_names = []
section_names = []
texts = []

for afile in files: 
    # Get chapter and section names from the file name
    file_path = Path(afile)
    # Extract relative file name (e.g., "2.1.txt")
    relative_file_name = file_path.relative_to(directory_path)
    # Remove the .txt suffix
    file_stem = relative_file_name.stem  
    # Extract chapter and section names
    try:
        chapter_name, section_name = file_stem.split('.')
    except ValueError:
        raise ValueError(f"Filename format is incorrect: '{file_stem}'. Expected format 'X.Y.txt'")
    
    # Read the afile content
    atext = read_text_file(afile)

    # Append the results to the list
    subjects.append(subject)
    chapter_names.append(chapter_name)  
    section_names.append(section_name)
    texts.append(atext)

# Create a DataFrame to store the results
text_data = {   
    'Subject': subjects,
    'Chapter': chapter_names,
    'Section': section_names,
    'Text': texts
}

text_df = pd.DataFrame(text_data)
# save the DataFrame to a CSV file
output_directory = Path('../output/texts')
output_directory.mkdir(parents=True, exist_ok=True) 
output_file = output_directory / f'{subject}_texts.csv'
# save output_file as a csv file
text_df.to_csv(output_file, index=False)


In [None]:
# Display the most recently built MCQ DataFrame
mcq_df

In [40]:
# Point `output_file` at '<subject>_mcqs.csv' in the MCQ output folder,
# creating the folder if it does not exist yet
output_directory = Path('../output/mcqs')
output_directory.mkdir(parents=True, exist_ok=True)
output_file = output_directory / f'{subject}_mcqs.csv'

In [41]:
# Write the current `mcq_df` to `output_file` as CSV, without the index column
mcq_df.to_csv(output_file, index=False)

In [None]:
# Display the current list of input files
files

In [None]:
# Preview each input file so the loaded text can be checked by eye
for afile in files:
    text = read_text_file(afile)
    print (f"Content of {afile}:")
    print(text[:1000])  # Print the first 1000 characters of the file



In [None]:
# Generate one fact, one inference and one main-idea question per file and
# print the raw workflow output.
# NOTE: this rebinds the global `mcq_list` to the workflow's return value, so the
# question list that the DataFrame-building cells read is overwritten.
for afile in files:
    text = read_text_file(afile)
    mcq_list = await question_generation_workflow(text, 
                                            fact = 1,
                                            inference = 1,
                                            main_idea = 1,
                                            model = 'gpt-4o',)
    print(f"Generated MCQs for {afile}:")
    print(mcq_list)

In [None]:
# Display the first input file path
files[0]

In [None]:

# Scratch cell: derive the chapter and section from a Windows-style sample path.
# NOTE: this overwrites the first entry of the global `files` list.
files[0] = '../data/sample_texts\\2.1.txt'
directory_path = '../data/sample_texts'

import re  

# Strip the leading '<directory_path>\' prefix. The directory is escaped so its
# '.' characters are matched literally, and the pattern is anchored so only a
# true prefix is removed.
file_name = re.sub('^' + re.escape(directory_path) + r'\\', '', files[0])
# Strip only a literal trailing '.txt'. An unescaped, unanchored '.txt' would
# also match any character followed by 'txt' anywhere in the name.
file_name = re.sub(r'\.txt$', '', file_name)
chapter_name = file_name.split('.')[0]
section_name = file_name.split('.')[1] 
print(chapter_name)
print(f"Processing file: {file_name}")

In [None]:
# Load the first input file into `atext`
atext = read_text_file(files[0])
print(atext[:1000])  # Print the first 1000 characters of the file

In [None]:
# Generate one fact, one inference and one main-idea question for the text
# currently held in `atext`; the result is kept in `mcqs` for inspection below
mcqs = await question_generation_workflow(atext, 
                                            fact = 1, 
                                            inference = 1,
                                            main_idea = 1,
                                            model = 'gpt-4o',)  

In [None]:
# Display the keys available on the first generated MCQ
mcqs[0].keys()

In [None]:
# Display the question of the first MCQ, or the placeholder if the key is missing
mcqs[0].get('mcq', 'No MCQ generated')

In [None]:
# Same lookup as the previous cell: the first MCQ's question, or the placeholder
mcqs[0].get('mcq', 'No MCQ generated')

In [None]:
# Display the answer of the first MCQ, or the placeholder if the key is missing
mcqs[0].get('mcq_answer', 'No MCQ answer generated')

In [23]:
# A regular expression to remove the 'Q1: ' from the mcq and mcq_answer
# NOTE: this modifies the first entry of `mcqs` in place, and raises KeyError
# if either key is missing (unlike the `.get` lookups used elsewhere).
import re   
mcqs[0]['mcq'] = re.sub(r'^Q\d+: ', '', mcqs[0]['mcq'])
mcqs[0]['mcq_answer'] = re.sub(r'^Q\d+: ', '', mcqs[0]['mcq_answer'])

In [None]:
# Display the first MCQ's question text
mcqs[0]['mcq'] 

In [None]:
# Display the first MCQ's answer text
mcqs[0]['mcq_answer']