In [1]:
# NOTE: The Korean (Hangul) conversion takes far too long to run (code #1), whereas the PDF conversion (code #2) finishes quickly.
import os
import re

# Placeholder that stands in for a period which must not end a sentence.
_EN_DOT_PLACEHOLDER = "<<DOT>>"

# Markup such as "</v>" or "<v ->".
_EN_TAG_RE = re.compile(r"<.*?>")
# Dialogue dashes only: a hyphen with a word character on BOTH sides
# ("well-known") is part of a word and is kept.
_EN_STRAY_HYPHEN_RE = re.compile(r"(?<!\w)-|-(?!\w)")
# Titles whose trailing period never ends a sentence.
_EN_TITLE_RE = re.compile(r"\b(?:Mr|Mrs|Ms|Miss|Dr|Jr|Sr)\.")
# "U.S" / "D.C": the inner period is always protected; the trailing period is
# protected too when the sentence visibly continues (lowercase word or , ; : ) next).
_EN_INITIALISM_RE = re.compile(r"\b(?:U\.S|D\.C)\b(?:\.(?=\s+[a-z]|\s*[,;:)]))?")
# Decimal numbers such as "3.14".
_EN_DECIMAL_RE = re.compile(r"\b\d+\.\d+\b")
# A run of terminators ("...", "?!"), any closing quotes/brackets, then whitespace or end of text.
_EN_SENTENCE_END_RE = re.compile(r"[.?!]+[\"'”’)\]]*(?=\s|$)")


def _en_clean_text(lines):
    """Join raw lines into one string, dropping tags, dialogue dashes and extra whitespace."""
    joined = " ".join(line.strip() for line in lines)
    without_tags = _EN_TAG_RE.sub("", joined)
    without_dashes = _EN_STRAY_HYPHEN_RE.sub("", without_tags)
    return " ".join(without_dashes.split())


def _en_protect_periods(text):
    """Replace periods that must not end a sentence with the placeholder."""
    def shield(match):
        return match.group(0).replace(".", _EN_DOT_PLACEHOLDER)

    for pattern in (_EN_INITIALISM_RE, _EN_TITLE_RE, _EN_DECIMAL_RE):
        text = pattern.sub(shield, text)
    return text


def _en_split_sentences(text):
    """Cut text after each sentence terminator and return the non-empty pieces."""
    sentences = []
    start = 0
    for match in _EN_SENTENCE_END_RE.finditer(text):
        piece = text[start:match.end()].strip()
        if piece:
            sentences.append(piece)
        start = match.end()
    remainder = text[start:].strip()
    if remainder:
        sentences.append(remainder)
    return sentences


def reorder_sentences_in_folder(input_folder_path):
    """
    Rewrite every ".txt" file in a folder so that each sentence sits on its own line.

    For each file the lines are joined into one string, tags such as "</v>" and "<v ->"
    are removed, and dialogue dashes are removed (hyphens inside words such as
    "well-known" are kept). The text is then split after ".", "?" or "!" when the
    terminator is followed by whitespace or the end of the text. A run of terminators
    ("...", "?!") and any closing quotes/brackets stay attached to the sentence.

    Periods that do NOT end a sentence:
      * titles: "Mr.", "Mrs.", "Ms.", "Miss.", "Dr.", "Jr.", "Sr.";
      * the inner period of "U.S" / "D.C", plus the trailing period when the next
        word is lowercase or the next character is one of , ; : ) -- so
        "Here in the U.S. we ..." stays on one line;
      * decimal numbers such as "3.14".

    Output goes to a folder named "en_reorder1" next to the input folder (a trailing
    path separator on the input path is tolerated). Each output file is named
    "<original stem>_reorder.txt". A line is printed for every processed file.

    Errors are printed, not raised: if the output folder cannot be created or the
    input folder cannot be listed, the function returns; if a single file cannot be
    read, decoded or written, it is skipped and the remaining files are still processed.

    :param input_folder_path: Path to the folder containing input text files
    """
    # normpath drops a trailing separator so dirname() yields the parent folder.
    parent_folder = os.path.dirname(os.path.normpath(input_folder_path))
    output_folder_path = os.path.join(parent_folder, "en_reorder1")

    try:
        os.makedirs(output_folder_path, exist_ok=True)
        filenames = os.listdir(input_folder_path)
    except OSError as e:
        print(f"An error occurred: {e}")
        return

    for filename in filenames:
        if not filename.endswith('.txt'):
            continue

        input_file_path = os.path.join(input_folder_path, filename)
        # Append "_reorder" to the output file name
        output_file_name = os.path.splitext(filename)[0] + "_reorder.txt"
        output_file_path = os.path.join(output_folder_path, output_file_name)

        try:
            with open(input_file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()

            protected_text = _en_protect_periods(_en_clean_text(lines))
            sentences = _en_split_sentences(protected_text)

            # Restore the protected periods only after splitting is done.
            reordered_text = "\n".join(sentences).replace(_EN_DOT_PLACEHOLDER, ".")

            with open(output_file_path, 'w', encoding='utf-8') as file:
                file.write(reordered_text)
        except (OSError, UnicodeError) as e:
            # One bad file must not abort the rest of the batch.
            print(f"An error occurred while processing {filename}: {e}")
            continue

        print(f"Processed file: {filename} -> {output_file_name}")

# Markup such as "</v>" or "<v ->".
_KR_TAG_RE = re.compile(r"<.*?>")
# Dialogue dashes only: a hyphen with a word character on BOTH sides is kept.
_KR_STRAY_HYPHEN_RE = re.compile(r"(?<!\w)-|-(?!\w)")


def _kr_sentence_end_pattern(sentence_endings):
    """Compile the pattern that matches the end of a Korean sentence."""
    if sentence_endings:
        # A period only counts when it directly follows one of the given syllables,
        # so "3.14" or "U.S." are never split.
        first_terminator = "(?:(?<=[" + re.escape(sentence_endings) + "])\\.|[?!])"
    else:
        first_terminator = "[?!]"
    # Further terminators ("...", "?!") and closing quotes/brackets stay attached.
    return re.compile(first_terminator + r"[.?!]*[\"'”’)\]]*")


def _kr_split_sentences(text, pattern):
    """Cut text after each match of pattern and return the non-empty pieces."""
    sentences = []
    start = 0
    for match in pattern.finditer(text):
        piece = text[start:match.end()].strip()
        if piece:
            sentences.append(piece)
        start = match.end()
    remainder = text[start:].strip()
    if remainder:
        sentences.append(remainder)
    return sentences


def reorder_sentences_in_folder_korean(input_folder_path, sentence_endings="다요"):
    """
    Rewrite every ".txt" file in a folder of Korean text so that each sentence sits on its own line.

    For each file the lines are joined into one string, tags such as "</v>" and "<v ->"
    are removed, and dialogue dashes are removed (hyphens inside words are kept).
    A sentence ends at "?" or "!", or at a "." that directly follows one of the
    syllables in ``sentence_endings`` (by default "다." and "요."). Periods elsewhere,
    e.g. in "3.14", do not split. Runs such as "..." or "?!" and closing
    quotes/brackets stay attached to the sentence.

    The previous implementation tested ``\\b(다|요)\\.``. Hangul syllables are word
    characters, so there is no word boundary inside "합니다." and ordinary text was
    never split. It also re-scanned the growing buffer at every punctuation mark,
    which made it very slow. This version makes a single regex pass over the text.

    Output goes to a folder named "kr_reorder1" next to the input folder (a trailing
    path separator on the input path is tolerated). Each output file is named
    "<original stem>_reorder.txt". A line is printed for every processed file.

    Errors are printed, not raised: if the output folder cannot be created or the
    input folder cannot be listed, the function returns; if a single file cannot be
    read, decoded or written, it is skipped and the remaining files are still processed.

    :param input_folder_path: Path to the folder containing input text files
    :param sentence_endings: Syllables after which a "." ends a sentence; pass "" to
        split on "?" and "!" only
    """
    # normpath drops a trailing separator so dirname() yields the parent folder.
    parent_folder = os.path.dirname(os.path.normpath(input_folder_path))
    output_folder_path = os.path.join(parent_folder, "kr_reorder1")

    try:
        os.makedirs(output_folder_path, exist_ok=True)
        filenames = os.listdir(input_folder_path)
    except OSError as e:
        print(f"An error occurred: {e}")
        return

    # Compiled once, reused for every file.
    sentence_end = _kr_sentence_end_pattern(sentence_endings)

    for filename in filenames:
        if not filename.endswith('.txt'):
            continue

        input_file_path = os.path.join(input_folder_path, filename)
        # Append "_reorder" to the output file name
        output_file_name = os.path.splitext(filename)[0] + "_reorder.txt"
        output_file_path = os.path.join(output_folder_path, output_file_name)

        try:
            with open(input_file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()

            # Join lines to reconstruct the text, then strip tags and dialogue dashes
            full_text = " ".join(line.strip() for line in lines)
            cleaned_text = _KR_TAG_RE.sub("", full_text)
            cleaned_text = _KR_STRAY_HYPHEN_RE.sub("", cleaned_text)
            cleaned_text = " ".join(cleaned_text.split())

            reordered_text = "\n".join(_kr_split_sentences(cleaned_text, sentence_end))

            with open(output_file_path, 'w', encoding='utf-8') as file:
                file.write(reordered_text)
        except (OSError, UnicodeError) as e:
            # One bad file must not abort the rest of the batch.
            print(f"An error occurred while processing {filename}: {e}")
            continue

        print(f"Processed file (Korean): {filename} -> {output_file_name}")

# Example usage for English: processes the .txt files in C:/Temp/en1.
# The call runs as soon as this cell is executed.
input_folder_path_en = r"C:/Temp/en1"
reorder_sentences_in_folder(input_folder_path_en)

# Example usage for Korean: processes the .txt files in C:/Temp/kr1.
input_folder_path_kr = r"C:/Temp/kr1"
reorder_sentences_in_folder_korean(input_folder_path_kr)


Processed file: videos_2025_january_a-globetrotters-guide-to-tech_en_text_only.txt -> videos_2025_january_a-globetrotters-guide-to-tech_en_text_only_reorder.txt
Processed file: videos_2025_january_ai-in-hollywood-applied-ai-the-good-the-bad-and-the-unknown_en_text_only.txt -> videos_2025_january_ai-in-hollywood-applied-ai-the-good-the-bad-and-the-unknown_en_text_only_reorder.txt
Processed file: videos_2025_january_ai-in-hollywood-the-tech-behind-the-hype_en_text_only.txt -> videos_2025_january_ai-in-hollywood-the-tech-behind-the-hype_en_text_only_reorder.txt
Processed file: videos_2025_january_ai-in-the-smart-home-applications-and-ethics_en_text_only.txt -> videos_2025_january_ai-in-the-smart-home-applications-and-ethics_en_text_only_reorder.txt
Processed file: videos_2025_january_data-breach-protection-and-prevention_en_text_only.txt -> videos_2025_january_data-breach-protection-and-prevention_en_text_only_reorder.txt
Processed file: videos_2025_january_data-collection-privacy-and-why

In [2]:
from fpdf import FPDF
import os

def convert_txt_to_pdf(input_folder_path, output_folder_name, font_path, font_name):
    """
    Converts all .txt files in a folder to .pdf files.
    Saves the PDF files in a specified output folder located next to the input folder
    (a trailing path separator on the input path is tolerated).
    Each input line is written to the PDF with one multi_cell call.

    Errors are printed, not raised: if the output folder cannot be created or the
    input folder cannot be listed, the function returns; if converting a single file
    fails, that file is skipped and the remaining files are still converted.

    :param input_folder_path: Path to the folder containing .txt files.
    :param output_folder_name: Name of the output folder to save .pdf files.
    :param font_path: Path to the TTF font file supporting the required language.
    :param font_name: Name to register the font in FPDF.
    """
    # normpath drops a trailing separator so dirname() yields the parent folder.
    parent_folder = os.path.dirname(os.path.normpath(input_folder_path))
    output_folder_path = os.path.join(parent_folder, output_folder_name)

    try:
        os.makedirs(output_folder_path, exist_ok=True)
        filenames = os.listdir(input_folder_path)
    except OSError as e:
        print(f"An error occurred: {e}")
        return

    for filename in filenames:
        if not filename.endswith('.txt'):
            continue

        input_file_path = os.path.join(input_folder_path, filename)
        output_file_name = os.path.splitext(filename)[0] + ".pdf"
        output_file_path = os.path.join(output_folder_path, output_file_name)

        try:
            # Create a PDF instance
            pdf = FPDF()
            pdf.set_auto_page_break(auto=True, margin=15)
            pdf.add_page()

            # Add a Unicode font
            # NOTE(review): `uni=True` is presumably only needed by older fpdf
            # releases -- confirm against the installed version.
            pdf.add_font(font_name, fname=font_path, uni=True)
            pdf.set_font(font_name, size=12)

            # Read the text file content
            with open(input_file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    pdf.multi_cell(0, 10, line.strip())

            # Save the PDF
            pdf.output(output_file_path)
        except Exception as e:
            # Batch boundary: the PDF library may raise its own exception types,
            # and one bad file must not abort the remaining conversions.
            print(f"An error occurred while converting {filename}: {e}")
            continue

        print(f"Converted file: {filename} -> {output_file_name}")

# Example usage: the input folders below are the "en_reorder1" / "kr_reorder1"
# folders written by the reorder functions in the previous cell.
input_folder_path_1 = r"C:/Temp/en_reorder1"
input_folder_path_2 = r"C:/Temp/kr_reorder1"

# Use DejaVuSans for English
# NOTE: the font path is specific to one user's Downloads folder; adjust it per machine.
convert_txt_to_pdf(input_folder_path_1, "en_pdf_output1", "C:/Users/CTE22-213/Downloads/dejavu-sans-ttf-2.37/dejavu-sans-ttf-2.37/ttf/DejaVuSans.ttf", "DejaVu")

# Use Malgun Gothic for Korean
convert_txt_to_pdf(input_folder_path_2, "kr_pdf_output1", "C:/Windows/Fonts/malgun.ttf", "MalgunGothic")


Converted file: videos_2025_january_a-globetrotters-guide-to-tech_en_text_only_reorder.txt -> videos_2025_january_a-globetrotters-guide-to-tech_en_text_only_reorder.pdf
Converted file: videos_2025_january_ai-in-hollywood-applied-ai-the-good-the-bad-and-the-unknown_en_text_only_reorder.txt -> videos_2025_january_ai-in-hollywood-applied-ai-the-good-the-bad-and-the-unknown_en_text_only_reorder.pdf
Converted file: videos_2025_january_ai-in-hollywood-the-tech-behind-the-hype_en_text_only_reorder.txt -> videos_2025_january_ai-in-hollywood-the-tech-behind-the-hype_en_text_only_reorder.pdf
Converted file: videos_2025_january_ai-in-the-smart-home-applications-and-ethics_en_text_only_reorder.txt -> videos_2025_january_ai-in-the-smart-home-applications-and-ethics_en_text_only_reorder.pdf
Converted file: videos_2025_january_data-breach-protection-and-prevention_en_text_only_reorder.txt -> videos_2025_january_data-breach-protection-and-prevention_en_text_only_reorder.pdf
Converted file: videos_2025