In [11]:
import re
import glob
import os
from textwrap import wrap
from striprtf.striprtf import rtf_to_text

# Section headers that separate the passage text from its two question blocks.
# The words of each header may be joined by whitespace, hyphens, or en/em
# dashes, and "Question"/"Blank" may be singular or plural. The trailing colon
# stays mandatory so that a plain in-text mention of the phrase is not treated
# as a header. Named groups record WHICH header matched.
_SECTION_HEADER_RE = re.compile(
    r'(?P<mcq>Multiple[\s\-–—]*Choice\s*Questions?)\s*:'
    r'|(?P<fibq>Fill[\s\-–—]*in[\s\-–—]*the[\s\-–—]*Blanks?\s*Questions?)\s*:',
    re.IGNORECASE,
)


def _wrap_paragraphs(text, width):
    """Re-wrap *text* to *width* columns, keeping blank-line paragraph breaks."""
    # textwrap.wrap treats single newlines inside a paragraph as ordinary
    # whitespace, so only '\n\n' boundaries survive the re-wrap.
    paragraphs = text.strip().split('\n\n')
    return '\n\n'.join('\n'.join(wrap(para, width=width)) for para in paragraphs)


def extract_sections_with_logging(rtf_path, output_folder, wrap_width=80):
    """Split one RTF passage file into intro, MCQ and FIBQ text files.

    The RTF at *rtf_path* is converted to plain text with ``rtf_to_text`` and
    cut at the "Multiple Choice Questions:" and "Fill-in-the-Blank Questions:"
    headers. Everything before the first header is the intro; the text after
    the first multiple-choice header is the MCQ section; the text after the
    last fill-in-the-blank header is the FIBQ section. Sections are identified
    by the header that precedes them, not by their position, so the two
    question blocks may appear in either order.

    On success three files are written to *output_folder*, named after the
    input file's base name with the suffixes ``_text.txt``, ``_MCQ.txt`` and
    ``_FIBQ.txt``, each re-wrapped to *wrap_width* columns.

    If either header kind is missing, nothing is written for that file; a
    message is printed and a line is appended to ``log.txt`` in
    *output_folder*.

    Args:
        rtf_path: Path of the ``.rtf`` file to process.
        output_folder: Directory receiving the ``.txt`` files and ``log.txt``;
            created if it does not exist.
        wrap_width: Maximum line width passed to ``textwrap.wrap``.

    Returns:
        None.
    """
    # Make the function usable on its own, without relying on the caller
    # having created the destination directory first.
    os.makedirs(output_folder, exist_ok=True)

    with open(rtf_path, 'r') as file:
        text = rtf_to_text(file.read())

    headers = list(_SECTION_HEADER_RE.finditer(text))

    # Each section body runs from the end of its header to the start of the
    # next header (or to the end of the document for the last one).
    mcq = None
    fibq = None
    for index, header in enumerate(headers):
        is_last = index + 1 == len(headers)
        body_end = len(text) if is_last else headers[index + 1].start()
        body = text[header.end():body_end]
        if header.lastgroup == 'mcq':
            if mcq is None:  # keep the first multiple-choice block
                mcq = body
        else:
            fibq = body  # keep the last fill-in-the-blank block

    if mcq is None or fibq is None:
        # Log unexpected structure; headers + 1 equals the number of pieces
        # the text was cut into.
        print(f"Unexpected structure in {rtf_path}. Sections found: {len(headers) + 1}")
        log_path = os.path.join(output_folder, "log.txt")
        with open(log_path, "a", encoding='utf-8') as log_file:
            log_file.write(f"Failed to process {rtf_path}. Insufficient sections identified.\n")
        return

    intro = text[:headers[0].start()]

    base_filename = os.path.splitext(os.path.basename(rtf_path))[0]
    outputs = [("_text.txt", intro), ("_MCQ.txt", mcq), ("_FIBQ.txt", fibq)]
    for suffix, content in outputs:
        output_path = os.path.join(output_folder, base_filename + suffix)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(_wrap_paragraphs(content, wrap_width))
    print(f"Processed {rtf_path}")

# Where the .rtf passages are read from and where the .txt results are written
input_folder = 'C:\\Users\\s224228\\Documents\\Fagpakkeprojekt-Eyes-Modelling\\Text_passges'
output_folder = 'C:\\Users\\s224228\\Documents\\Fagpakkeprojekt-Eyes-Modelling\\txtfiles'

# Create the destination directory up front (no error if it is already there)
os.makedirs(output_folder, exist_ok=True)

# Run the extraction on every *.rtf file located directly in the input folder
rtf_glob = os.path.join(input_folder, '*.rtf')
for rtf_filename in glob.glob(rtf_glob):
    extract_sections_with_logging(rtf_filename, output_folder)


Processed C:\Users\s224228\Documents\Fagpakkeprojekt-Eyes-Modelling\Text_passges\AI_HC_P01.rtf
Processed C:\Users\s224228\Documents\Fagpakkeprojekt-Eyes-Modelling\Text_passges\AI_HC_P02.rtf
Processed C:\Users\s224228\Documents\Fagpakkeprojekt-Eyes-Modelling\Text_passges\AI_HC_P03.rtf
Processed C:\Users\s224228\Documents\Fagpakkeprojekt-Eyes-Modelling\Text_passges\AI_HC_P04.rtf
Processed C:\Users\s224228\Documents\Fagpakkeprojekt-Eyes-Modelling\Text_passges\AI_HC_P05.rtf
Processed C:\Users\s224228\Documents\Fagpakkeprojekt-Eyes-Modelling\Text_passges\AI_HC_P06.rtf
Unexpected structure in C:\Users\s224228\Documents\Fagpakkeprojekt-Eyes-Modelling\Text_passges\AI_KB_P01.rtf. Sections found: 2
Unexpected structure in C:\Users\s224228\Documents\Fagpakkeprojekt-Eyes-Modelling\Text_passges\AI_KB_P02.rtf. Sections found: 2
Unexpected structure in C:\Users\s224228\Documents\Fagpakkeprojekt-Eyes-Modelling\Text_passges\AI_KB_P03.rtf. Sections found: 2
Unexpected structure in C:\Users\s224228\Docum