In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [44]:
import xml.etree.ElementTree as ET
from collections import defaultdict

In [43]:
# Directory prefix that each file name below is appended to when parsing.
xml_directory = "xml/2024-06-12/gnt/"

# Source XML file names (numbered 01-27) to inventory.
files = {
    '01-matthew.xml',
    '02-mark.xml',
    '03-luke.xml',
    '04-john.xml',
    '05-acts.xml',
    '06-romans.xml',
    '07-1corinthians.xml',
    '08-2corinthians.xml',
    '09-galatians.xml',
    '10-ephesians.xml',
    '11-philippians.xml',
    '12-colossians.xml',
    '13-1thessalonians.xml',
    '14-2thessalonians.xml',
    '15-1timothy.xml',
    '16-2timothy.xml',
    '17-titus.xml',
    '18-philemon.xml',
    '19-hebrews.xml',
    '20-james.xml',
    '21-1peter.xml',
    '22-2peter.xml',
    '23-1john.xml',
    '24-2john.xml',
    '25-3john.xml',
    '26-jude.xml',
    '27-revelation.xml',
}

def parse_and_inventory_xml(xml_file, keyword_attributes, remaining_attributes, tags=None):
    """Collect every attribute value found in one XML file, grouped by tag.

    The file is parsed with ElementTree and every element (root included) is
    visited in document order. Each attribute value is appended to
    ``keyword_attributes[tag][attr]`` when the element's tag is one of the
    keyword tags, and to ``remaining_attributes[tag][attr]`` otherwise.

    Args:
        xml_file: Source accepted by ``ET.parse`` (e.g. a path string).
        keyword_attributes: Two-level mapping ``tag -> attr -> list`` that is
            updated in place. Missing levels must be created on access
            (e.g. a nested ``defaultdict``).
        remaining_attributes: Same shape as ``keyword_attributes``; receives
            the attributes of every element whose tag is not a keyword tag.
        tags: Optional container of tag names to treat as keyword tags.
            When omitted, the module-level ``keyword_tags`` is looked up at
            call time, which is what this function did before the parameter
            existed.

    Returns:
        None. Both mappings are mutated in place.
    """
    if tags is None:
        # Backward-compatible fallback to the notebook-level global.
        tags = keyword_tags

    root = ET.parse(xml_file).getroot()

    # Element.iter() yields the root and all descendants in document order
    # without Python-level recursion, so deeply nested trees cannot hit the
    # interpreter's recursion limit.
    for element in root.iter():
        target = keyword_attributes if element.tag in tags else remaining_attributes
        # Index target[element.tag] only inside the loop: elements without
        # attributes must not create an (empty) entry for their tag.
        for attr, value in element.attrib.items():
            target[element.tag][attr].append(value)

# Accumulators shared by every file: tag -> attribute -> list of raw values.
keyword_attributes = defaultdict(lambda: defaultdict(list))
remaining_attributes = defaultdict(lambda: defaultdict(list))

# Tags whose attributes are collected into keyword_attributes; every other
# tag goes to remaining_attributes.
keyword_tags = ['w', 'wg']

# `files` is a set of strings, whose iteration order can change between
# kernel sessions (string hashing is randomized). The accumulators remember
# first-seen order of tags and attributes, so iterate in sorted order to keep
# the generated report stable from run to run.
for file in sorted(files):
    xml_file = f"{xml_directory}{file}"
    parse_and_inventory_xml(xml_file, keyword_attributes, remaining_attributes)

def replace_numbers_with_N(value):
    """Return ``value`` with every digit character replaced by the letter 'N'.

    A character counts as a digit when ``str.isdigit()`` is true for it; all
    other characters are kept unchanged and in their original positions.
    """
    masked = []
    for ch in value:
        masked.append('N' if ch.isdigit() else ch)
    return ''.join(masked)

def format_inventory(attr_dict):
    """Render a tag/attribute inventory as indented plain text.

    For each tag (in the mapping's iteration order) a ``<tag>`` header line is
    emitted. For each of its attributes, the values are first normalized with
    ``replace_numbers_with_N`` and counted; the distinct normalized values are
    then listed in sorted order as ``<count>x <value>``. The first row of an
    attribute is labelled ``attr=``; every following row carries a ditto mark
    (``"``) instead of repeating the attribute name.

    Args:
        attr_dict: Mapping ``tag -> attr -> iterable of string values``.

    Returns:
        The report as a single string (empty when ``attr_dict`` is empty).
    """
    lines = []
    for tag, attrs in attr_dict.items():
        lines.append(f"\n\t<{tag}>\n")
        for attr, values in attrs.items():
            pattern_counts = {}
            for value in values:
                pattern = replace_numbers_with_N(value)
                pattern_counts[pattern] = pattern_counts.get(pattern, 0) + 1

            # Dict keys are unique, so only the row position decides between
            # the "attr=" label (first row) and the ditto mark (later rows).
            for position, (val, count) in enumerate(sorted(pattern_counts.items())):
                if position == 0:
                    lines.append(f"\t               {attr}=           {count}x {val}\n")
                else:
                    lines.append(f"\t               \"              {count}x {val}\n")
    return "".join(lines)

# Assemble the full report: a short table of contents followed by the two
# formatted inventories, in the same order as listed in the header.
inventory_report = "".join([
    "Inventory of tags and attributes in the source XML files.\n",
    "Contains the following sections:\n",
    "\tKeyword Attributes\n",
    "\tRemaining Attributes and Elements\n\n",
    "Keyword Attributes\n",
    format_inventory(keyword_attributes),
    "\nRemaining Attributes and Elements\n",
    format_inventory(remaining_attributes),
])

# Write the report as UTF-8 text, replacing any existing file at this path.
inventory_file = "report/2024-06-12/elements_complete_v2.txt"
with open(inventory_file, "w", encoding="utf-8") as f:
    f.write(inventory_report)

print(f"Inventário de tags e atributos salvo em: {inventory_file}")

Inventário de tags e atributos salvo em: report/2024-06-12/elements_complete_v2.txt


In [42]:
# Shell out to `diff` to compare an earlier report (v1, not produced in the
# visible cells) against the v2 report written by this notebook.
! diff report/2024-06-12/elements_complete_v1.txt report/2024-06-12/elements_complete_v2.txt

12c12
< 	               "              30152x cl
---
> 	               "              30857x cl
14c14
< 	               "              30911x np
---
> 	               "              33710x np
18,19c18,19
< 	               rule=           67x ADV-ADV
< 	               "              14x ADV-ADV-ADV
---
> 	               rule=           64x ADV-ADV
> 	               "              11x ADV-ADV-ADV
29,30c29,30
< 	               "              24x ADV-ADV-ADV-V
< 	               "              9x ADV-ADV-ADV-V-ADV
---
> 	               "              21x ADV-ADV-ADV-V
> 	               "              8x ADV-ADV-ADV-V-ADV
52c52
< 	               "              6x ADV-ADV-P-VC
---
> 	               "              5x ADV-ADV-P-VC
56,57c56
< 	               "              1x ADV-ADV-S-ADV-P-VC
< 	               "              2x ADV-ADV-S-ADV-V
---
> 	               "              1x ADV-ADV-S-ADV-V
61d59
< 	               "              1x ADV-ADV-S-O-V-ADV
65c63
< 	               "           