In [15]:
import os
import re

# Marker swapped in for periods that must not end a sentence. It is inserted
# only after tag removal, so the "<...>" tag pattern can never strip it.
_DOT_PLACEHOLDER = "<<DOT>>"

# Anything between angle brackets, e.g. "</v>" or "<v ->".
_TAG_RE = re.compile(r"<.*?>")
# A hyphen that is NOT between two word characters (leading "- ", " - ",
# trailing "word-"). Hyphens inside words such as "state-of-the-art" are kept.
_STRAY_DASH_RE = re.compile(r"(?<!\w)-|-(?!\w)")
# Titles whose trailing period never ends a sentence.
_TITLE_RE = re.compile(r"\b(?:Mr|Mrs|Ms|Dr|Jr|Sr|Miss)\.")
# "D.C" / "U.S": the inner period is always protected; the trailing period is
# protected only when the sentence visibly continues (lowercase word or
# closing punctuation follows), so "D.C. The next..." still splits.
_INITIALISM_RE = re.compile(r"\b(?:D\.C|U\.S)\b(?:\.(?=\s+[a-z]|[,;:)]))?")
# Decimal numbers such as "3.14".
_DECIMAL_RE = re.compile(r"\b\d+\.\d+\b")
# One sentence: text up to a run of terminators ("...", "?!") plus any closing
# quotes/brackets glued to it, or the unterminated tail of the text.
_SENTENCE_RE = re.compile(r"[^.?!]*[.?!]+[\"'\u201d\u2019)\]]*|[^.?!]+")


def _protect_dots(match):
    """Return the matched text with every period replaced by the placeholder."""
    return match.group(0).replace(".", _DOT_PLACEHOLDER)


def _reorder_text(text):
    """Return *text* cleaned of tags/stray dashes with one sentence per line."""
    # Rejoin hard-wrapped lines first so tags spanning a line break are still
    # matched by the (non-DOTALL) tag pattern.
    cleaned = " ".join(text.split())
    cleaned = _TAG_RE.sub("", cleaned)
    cleaned = _STRAY_DASH_RE.sub("", cleaned)
    # Removing tags and dashes can leave double spaces behind; collapse them.
    cleaned = " ".join(cleaned.split())

    for pattern in (_TITLE_RE, _INITIALISM_RE, _DECIMAL_RE):
        cleaned = pattern.sub(_protect_dots, cleaned)

    sentences = [chunk.strip() for chunk in _SENTENCE_RE.findall(cleaned)]
    reordered = "\n".join(sentence for sentence in sentences if sentence)
    return reordered.replace(_DOT_PLACEHOLDER, ".")


def reorder_sentences_in_folder(input_folder_path):
    """
    Rewrite every ".txt" file of a folder with one sentence per line.

    For each file the hard line breaks are discarded, "<...>" tags (e.g. "</v>",
    "<v ->") and stray "-" markers are removed, and the text is split again after
    sentence-ending punctuation ('.', '?', '!').

    Periods that do not end a sentence are left alone:
      * titles: "Mr.", "Mrs.", "Ms.", "Dr.", "Jr.", "Sr.", "Miss."
      * "D.C." / "U.S." (the final period only when a lowercase word or closing
        punctuation follows, i.e. the sentence visibly continues)
      * decimal numbers such as "3.14"

    Runs of terminators ("...", "?!") stay on the line of their sentence, and
    hyphens inside words ("state-of-the-art") are preserved.

    Results are written to a folder named "en_reorder" located next to the input
    folder (a trailing path separator on the input path is tolerated). Each output
    file is named after its source with "_reorder" appended before the extension.

    Errors are reported with print() instead of being raised; a file that cannot
    be read or written is skipped and the remaining files are still processed.

    :param input_folder_path: Path to the folder containing input text files
    """
    try:
        # abspath() drops any trailing separator; without it dirname("x/en/")
        # returns "x/en" and the output folder would land INSIDE the input folder.
        parent_dir = os.path.dirname(os.path.abspath(input_folder_path))
        output_folder_path = os.path.join(parent_dir, "en_reorder")
        os.makedirs(output_folder_path, exist_ok=True)

        for filename in sorted(os.listdir(input_folder_path)):
            if not filename.endswith('.txt'):
                continue

            input_file_path = os.path.join(input_folder_path, filename)
            # Append "_reorder" to the output file name
            output_file_name = os.path.splitext(filename)[0] + "_reorder.txt"
            output_file_path = os.path.join(output_folder_path, output_file_name)

            try:
                with open(input_file_path, 'r', encoding='utf-8') as file:
                    raw_text = file.read()
                # Build the result before opening the output so a failure
                # cannot leave a truncated file behind.
                reordered_text = _reorder_text(raw_text)
                with open(output_file_path, 'w', encoding='utf-8') as file:
                    file.write(reordered_text)
            except (OSError, UnicodeError) as e:
                # One unreadable file must not abort the rest of the folder.
                print(f"An error occurred while processing {filename}: {e}")
                continue

            print(f"Processed file: {filename} -> {output_file_name}")

    except Exception as e:
        # Best-effort boundary kept from the original: report, never raise.
        print(f"An error occurred: {e}")

# Example usage: set this to the folder that holds the input .txt files,
# then run the reordering over every file in it.
input_folder_path = r"C:/Temp/en"
reorder_sentences_in_folder(input_folder_path=input_folder_path)


Processed file: videos_2025_january_2030-the-impact-of-xr-headsets-ar-and-smart-glasses-on-spatial-computing_en_text_only.txt -> videos_2025_january_2030-the-impact-of-xr-headsets-ar-and-smart-glasses-on-spatial-computing_en_text_only_reorder.txt
Processed file: videos_2025_january_accenture-keynote_en_text_only.txt -> videos_2025_january_accenture-keynote_en_text_only_reorder.txt
Processed file: videos_2025_january_advancing-women-s-health-innovations-challenges-and-solutions_en_text_only.txt -> videos_2025_january_advancing-women-s-health-innovations-challenges-and-solutions_en_text_only_reorder.txt
Processed file: videos_2025_january_Automotive-Cybersecurity-Revs-Up_en_text_only.txt -> videos_2025_january_Automotive-Cybersecurity-Revs-Up_en_text_only_reorder.txt
Processed file: videos_2025_january_Autonomous-Vehicles-The-Future-is-Finally-Here_en_text_only.txt -> videos_2025_january_Autonomous-Vehicles-The-Future-is-Finally-Here_en_text_only_reorder.txt
Processed file: videos_2025_j