In [None]:
import os
import xml.etree.ElementTree as ET
from collections import defaultdict

def _fill_empty_text(en_parent, parent):
    """Recursively copy English text into the empty descendants of ``parent``.

    Children are paired positionally with ``zip``, so pairing stops at the
    shorter of the two child lists.
    """
    for en_child, child in zip(en_parent, parent):
        if child.text is None or child.text.strip() == "":
            child.text = en_child.text if en_child.text else ""
        # Descend so the fallback applies at every nesting depth
        _fill_empty_text(en_child, child)


def copy_en_content(segments_by_lang):
    """Fill empty text in non-English segments with the matching English text.

    Segments are matched to their English counterpart by the ``id`` attribute.
    Within a matched pair, descendants are paired positionally (first child
    with first child, and so on) at every nesting depth. Whenever the
    non-English element's text is missing or whitespace-only, it is replaced
    by the English element's text (or ``""`` when the English text is empty
    as well). The text of the segment element itself is never touched.

    Segments that carry no ``id`` attribute cannot be matched and are left
    unchanged instead of raising ``KeyError``.

    Args:
        segments_by_lang: Mapping of language code to a list of
            ``xml.etree.ElementTree.Element`` segments.

    Returns:
        The same mapping object; segment elements are modified in place.
    """
    if "en" not in segments_by_lang:
        print("No English segments found. Skipping content copying.")
        return segments_by_lang  # Nothing to fall back on

    # Index English segments by id; if an id repeats, the last one wins
    en_segments = {}
    for en_seg in segments_by_lang["en"]:
        en_id = en_seg.get("id")
        if en_id is not None:
            en_segments[en_id] = en_seg

    for lang, segments in segments_by_lang.items():
        if lang == "en":
            continue  # English is the source, never a target

        for seg in segments:
            en_match = en_segments.get(seg.get("id"))
            # Compare against None: an Element without children is falsy
            if en_match is not None:
                _fill_empty_text(en_match, seg)

    return segments_by_lang

def split_xml_by_language(input_xml_path, output_folder):
    """Write one ``<lang>.xml`` file per language found in the input XML.

    The direct ``<segment>`` children of the input's root element are grouped
    by their ``lang`` attribute. Empty text in non-English segments is filled
    from English via ``copy_en_content``. Each group is then written under a
    fresh ``<document>`` root to ``<output_folder>/<lang>.xml`` as UTF-8 with
    an XML declaration.

    Args:
        input_xml_path: Path of the multilingual XML file to read.
        output_folder: Folder receiving the per-language files; it is created
            if it does not exist.

    Raises:
        KeyError: If a ``<segment>`` has no ``lang`` attribute.
    """
    # The destination folder is created before the input is parsed
    os.makedirs(output_folder, exist_ok=True)

    source_root = ET.parse(input_xml_path).getroot()

    # Bucket segments per language, keeping first-seen language order
    by_lang = {}
    for element in source_root.findall("segment"):
        by_lang.setdefault(element.attrib["lang"], []).append(element)

    # English acts as the fallback for empty text in the other languages
    by_lang = copy_en_content(by_lang)

    for lang_code, lang_elements in by_lang.items():
        document = ET.Element("document")
        document.extend(lang_elements)

        target_path = os.path.join(output_folder, f"{lang_code}.xml")
        ET.ElementTree(document).write(
            target_path, encoding="utf-8", xml_declaration=True
        )

        print(f"Saved {target_path}")

# Example usage: split the sample multilingual XML into one file per language.
# Both paths are relative, so they resolve against the current working directory.
input_xml_path = "filehandling/files/multi_lingual_xml.xml"  # Replace with actual XML file path
output_folder = "./testing"    # Replace with actual output folder path (created if it does not exist)
split_xml_by_language(input_xml_path, output_folder)


In [1]:
import os
import xml.etree.ElementTree as ET

def merge_xml_files(xml_file_paths, output_file):
    """Merge the ``<segment>`` elements of several XML files into one file.

    Each existing input file is parsed and the direct ``<segment>`` children
    of its root are appended, in input order, under a new ``<document>`` root.
    The result is written to ``output_file`` as UTF-8 with an XML declaration.

    Args:
        xml_file_paths: Iterable of input XML file paths. Paths that do not
            exist are reported with a warning and skipped.
        output_file: Destination path. When it is a path (``str``, ``bytes``
            or ``os.PathLike``), its parent folder is created if missing.

    Raises:
        xml.etree.ElementTree.ParseError: If an existing input file is not
            well-formed XML.
    """
    merged_root = ET.Element("document")

    for file_path in xml_file_paths:
        if not os.path.exists(file_path):
            print(f"Warning: File not found - {file_path}")
            continue

        # Only direct <segment> children of the file's root are carried over
        merged_root.extend(ET.parse(file_path).getroot().findall("segment"))

    # Create the destination folder so the write does not fail after all the
    # parsing work. Non-path targets (e.g. an open file object, which
    # ElementTree.write also accepts) are left alone.
    if isinstance(output_file, (str, bytes, os.PathLike)):
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    merged_tree = ET.ElementTree(merged_root)
    merged_tree.write(output_file, encoding="utf-8", xml_declaration=True)

    print(f"Merged XML saved to: {output_file}")

# Example usage: merge several XML files back into a single document.
# NOTE(review): these look like the per-language files written by the split
# cell into ./testing — confirm. Any listed path that does not exist is
# skipped with a warning rather than raising.
xml_files = [
    "testing/en.xml",
    "testing/de.xml",
    "testing/fr.xml",
    "testing/it.xml"
]  # Replace with actual XML file paths

output_xml = "testing/merged.xml"  # Output file path (relative to the working directory)

merge_xml_files(xml_files, output_xml)


Merged XML saved to: testing/merged.xml
