# Parsing masses from Isilo and generating XML

## Import Libraries
This cell imports the libraries used throughout the notebook: BeautifulSoup parses the source HTML, `xml.etree.ElementTree` builds the XML tree, `xml.dom.minidom` pretty-prints it, `re` handles regular expressions, and `os` handles the output directories.

In [54]:
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import xml.dom.minidom
import re
import os

## Helper functions

### Function: extract_sections
This function reads an HTML file with BeautifulSoup and groups the text of its `<p>` paragraphs under the `<h3>` heading that precedes them (consecutive `<h3>` elements are joined into a single heading). It returns a dictionary that maps each heading to its list of paragraph texts, which the later steps split into sections.

In [55]:
def extract_sections(file_path):
    """Group the paragraph texts of an HTML file under their preceding headings.

    The document body is walked in order. Runs of consecutive ``<h3>``
    elements are joined (space separated) into one heading; every ``<p>``
    that follows is stored under that heading until the next ``<h3>`` run.

    Args:
        file_path: Path of the UTF-8 encoded HTML file to read.

    Returns:
        dict mapping each heading (stripped, newlines turned into spaces) to
        the list of paragraph texts found below it, in document order.
        Paragraphs that appear before the first heading are dropped.
    """
    with open(file_path, 'r', encoding='utf-8') as html_file:
        document = BeautifulSoup(html_file, 'html.parser')

    paragraphs_by_heading = {}
    heading = ''
    in_heading_run = False

    for node in document.body.find_all():
        node_text = node.get_text()

        # Elements holding only a non-breaking space are layout spacers.
        if node_text == '\xa0':
            continue

        if node.name == 'h3':
            # A heading that directly follows another heading extends it;
            # otherwise a new heading is started from scratch.
            prefix = heading if in_heading_run else ''
            heading = prefix + ' ' + node_text
            in_heading_run = True
            continue

        if node.name != 'p':
            continue

        if in_heading_run:
            # First paragraph after a heading run: finalise the key and
            # open a fresh bucket for it.
            heading = heading.strip().replace('\n', ' ')
            paragraphs_by_heading[heading] = []
            in_heading_run = False

        if heading != '':
            paragraphs_by_heading[heading].append(node_text)

    return paragraphs_by_heading

### Function: get_mass_by_sections
This function takes the list of paragraphs of a single mass and splits it into sections. A paragraph that contains any of the given section markers starts a new section and becomes its key; the paragraphs that follow are collected under that key until the next section title is found.

In [56]:
def get_mass_by_sections(mass_raw_text, sections):
    """Split the paragraphs of one mass into titled sections.

    Args:
        mass_raw_text: Iterable of paragraph strings, in document order.
        sections: Collection of marker strings; a paragraph containing any
            of them is treated as a section title.

    Returns:
        dict mapping each title paragraph (newlines replaced by spaces) to
        the list of paragraphs that follow it, up to the next title.
        Paragraphs before the first title are dropped, and a title that
        occurs again starts over with an empty list.
    """
    grouped = {}
    active_title = ''

    for raw_paragraph in mass_raw_text:
        paragraph = raw_paragraph.replace('\n', ' ')

        if any(marker in paragraph for marker in sections):
            # The whole paragraph, not just the marker, becomes the key.
            active_title = paragraph
            grouped[active_title] = []
        elif active_title != '':
            grouped[active_title].append(paragraph)

    return grouped

### Function: create_xml_mass_readings
This function converts the selected reading sections of a mass into XML. It builds the XML elements from the given metadata, reading indexes, and mass sections, pretty-prints the result, and writes it to `../_xml_data/<season>/sem-<week>/<filename>.xml`.

In [57]:
def _add_proclaimed_text(reading, section_content, closing_formula):
    """Append snippet, announcement, text and closing formula to a reading or gospel."""
    snippet = section_content[0]
    announcement = section_content[1]
    # The closing formula is emitted as its own element below, so it is
    # removed from the body when the source text ends with it.
    text = re.sub(r"(Palavra da salvação\.|Palavra do Senhor\.)$", "", section_content[2])

    ET.SubElement(reading, "snippet").text = snippet
    ET.SubElement(reading, "announcement").text = announcement
    ET.SubElement(reading, "text").text = text
    # NOTE(review): the tag is spelled "endind" (presumably meant "ending");
    # kept unchanged because consumers of the XML may rely on it - confirm.
    ET.SubElement(reading, "endind").text = closing_formula


def _add_acclamation(reading, section_content):
    """Append the response and text of an 'Aleluia' section."""
    # Only the part after the last ': ' of the first paragraph is the response.
    response = section_content[0].split(': ')[-1]
    text = section_content[1]
    ET.SubElement(reading, "response").text = response
    ET.SubElement(reading, "text").text = text


def _add_psalm(reading, section_content):
    """Append notice, responses and verses of a 'Salmo' section."""
    response_idx = 0
    # A first paragraph opening with '(' is stored as a notice and the
    # responses are then read from the next paragraph onwards.
    if section_content[0][0] == '(':
        ET.SubElement(reading, "notice").text = section_content[0]
        response_idx = 1

    response = section_content[response_idx].split(': ')[-1]
    latin_response = section_content[response_idx + 1].split(': ')[-1]
    ET.SubElement(reading, "response").text = response
    ET.SubElement(reading, "latin_response").text = latin_response

    first_verse_idx = response_idx + 2
    if section_content[first_verse_idx].split(' ')[0] == 'Ou:':
        # An 'Ou:' paragraph is an alternative response; verses start after it.
        ET.SubElement(reading, "alt-response").text = section_content[first_verse_idx]
        first_verse_idx += 1

    # Every third paragraph from the first verse is a verse; the two
    # paragraphs in between are skipped (presumably the repeated
    # responses - confirm against the source files).
    for verse_number, verse in enumerate(section_content[first_verse_idx::3], start=1):
        ET.SubElement(reading, f"verse-{verse_number}").text = verse


def create_xml_mass_readings(mass_metadata, reading_idxs, mass_by_section, filename):
    """Write the readings of one mass to a pretty-printed XML file.

    The file is written to
    ``../_xml_data/<season>/sem-<week>/<filename>.xml`` (relative to the
    current working directory); missing directories are created.

    Args:
        mass_metadata: dict of string attributes for the ``<mass>`` element.
            Must contain the keys ``'season'`` and ``'week'``, which also
            determine the output directory.
        reading_idxs: Positions, within the keys of ``mass_by_section``, of
            the sections to export. They become ``reading-1``,
            ``reading-2``, ... in the given order.
        mass_by_section: dict mapping a section title (``"NAME - reference"``)
            to the list of its paragraphs.
        filename: Name of the output file, without the ``.xml`` extension.

    Raises:
        IndexError: If a section has fewer paragraphs than its kind requires.
        KeyError: If ``mass_metadata`` lacks ``'season'`` or ``'week'``.
    """
    root = ET.Element("xml")
    mass = ET.SubElement(root, "mass", mass_metadata)

    sections = list(mass_by_section.keys())
    for reading_number, idx in enumerate(reading_idxs, start=1):
        title = sections[idx]
        section_content = mass_by_section[title]

        # Titles look like "NAME - reference"; the reference itself may
        # contain " - ", so everything after the first part is re-joined.
        title_parts = title.split(' - ')
        name = title_parts[0].title()
        name_words = name.split(" ")
        if name_words[0] == "Leitura":
            # str.title() turns the roman numeral "II" into "Ii"; restore it.
            name = name_words[0].title() + ' ' + name_words[1].upper()
        reference = " - ".join(title_parts[1:]) or None

        reading = ET.SubElement(mass, f"reading-{reading_number}")
        ET.SubElement(reading, "name").text = name
        ET.SubElement(reading, "reference").text = reference

        if 'Leitura' in name:
            _add_proclaimed_text(reading, section_content, "Palavra do Senhor.")

        if 'Evangelho' in name:
            _add_proclaimed_text(reading, section_content, "Palavra da salvação.")

        if 'Aleluia' in name:
            _add_acclamation(reading, section_content)

        if 'Salmo' in name:
            _add_psalm(reading, section_content)

    # Serialise with ElementTree, then re-parse with minidom to pretty-print.
    xml_string = ET.tostring(root, 'utf-8')
    dom = xml.dom.minidom.parseString(xml_string)
    pretty_xml_as_string = dom.toprettyxml(indent="\t")

    output_directory = f"../_xml_data/{mass_metadata['season']}/sem-{mass_metadata['week']}"
    # exist_ok avoids the check-then-create gap of exists() + makedirs().
    os.makedirs(output_directory, exist_ok=True)

    output_file_path = f"{output_directory}/{filename}.xml"
    with open(output_file_path, "w", encoding="UTF-8") as file:
        file.write(pretty_xml_as_string)

### Possible sections to be found inside the raw data

In [58]:
# Marker strings for section titles: a paragraph that contains any of
# them is treated as the start of a new section.
possible_sections = [
    # Antiphons and prayers
    "ANTÍFONA DE ENTRADA",
    "ORAÇÃO COLECTA",
    "ANTÍFONA DA COMUNHÃO",
    "ORAÇÃO SOBRE AS OBLATAS",
    "ORAÇÃO DEPOIS DA COMUNHÃO",
    # Readings, psalm, acclamation and gospel
    "LEITURA I ",  # the trailing space is part of the marker - keep it
    "SALMO RESPONSORIAL",
    "ALELUIA",
    "LEITURA II",
    "EVANGELHO",
]

## Script for Processing Specific Files

### Weeks 1-3

In [59]:
# Advent weeks 1-3: one source file per week, one XML file per mass.
season = "advent"

file_paths = [
    "../_old/AdvSem01.htm",
    "../_old/AdvSem02.htm",
    "../_old/AdvSem03.htm",
]

# Label (and output file name) of each mass, by its position in the file.
weekdays = [
    "sunday-A", 
    "sunday-B", 
    "sunday-C", 
    "monday", 
    "tuesday", 
    "wednesday", 
    "thursday", 
    "friday", 
    "saturday"
]

# Sections whose title contains one of these words are exported as readings.
keywords = ["EVANGELHO", "LEITURA", "ALELUIA", "SALMO"]

for file_path in file_paths:
    masses_raw_text = extract_sections(file_path)
    # The two characters before the ".htm" extension are the week number.
    week = file_path[-6:-4]

    # The first heading group is skipped (presumably a file-level title
    # rather than a mass - confirm against the source files).
    for mass_idx, key in enumerate(list(masses_raw_text.keys())[1:]):
        mass_by_section = get_mass_by_sections(masses_raw_text[key], possible_sections)
        sections = list(mass_by_section.keys())

        reading_idxs = [
            section_idx
            for section_idx, title in enumerate(sections)
            if any(word in title for word in keywords)
        ]

        mass_metadata = {
            "season": season,
            "week": week,
            "weekday": weekdays[mass_idx],
        }

        create_xml_mass_readings(mass_metadata, reading_idxs, mass_by_section, weekdays[mass_idx])

### Week 4 (Sundays)

In [60]:
# Advent week 4, Sundays only: the three Sunday masses (cycles A, B, C).
season = "advent"

file_paths = [
    "../_old/AdvSem04.htm",
]

# Label (and output file name) of each Sunday mass, by its position.
weekdays = [
    "sunday-A", 
    "sunday-B", 
    "sunday-C",
]

# Sections whose title contains one of these words are exported as readings.
keywords = ["EVANGELHO", "LEITURA", "ALELUIA", "SALMO"]

for file_path in file_paths:
    masses_raw_text = extract_sections(file_path)
    # The two characters before the ".htm" extension are the week number.
    week = file_path[-6:-4]

    # Only the second to fourth heading groups are used here; the first is
    # skipped and the remaining ones are left to the specific-days cell.
    for mass_idx, key in enumerate(list(masses_raw_text.keys())[1:4]):
        mass_by_section = get_mass_by_sections(masses_raw_text[key], possible_sections)
        sections = list(mass_by_section.keys())

        reading_idxs = [
            section_idx
            for section_idx, title in enumerate(sections)
            if any(word in title for word in keywords)
        ]

        mass_metadata = {
            "season": season,
            "week": week,
            "weekday": weekdays[mass_idx],
        }

        create_xml_mass_readings(mass_metadata, reading_idxs, mass_by_section, weekdays[mass_idx])

### Week 4 (Specific days)

In [61]:
# Advent week 4, fixed dates: masses identified by day of the month.
season = "advent"
month = "december"

file_paths = [
    "../_old/AdvSem04.htm",
]

# Days 17 to 24, as strings, matched to the masses by position.
days = [str(day) for day in range(17, 25)]

# Sections whose title contains one of these words are exported as readings.
keywords = ["EVANGELHO", "LEITURA", "ALELUIA", "SALMO"]

for file_path in file_paths:
    masses_raw_text = extract_sections(file_path)
    # The two characters before the ".htm" extension are the week number.
    week = file_path[-6:-4]

    # The first four heading groups are skipped; the Sunday cell handles
    # groups two to four of the same file.
    for mass_idx, key in enumerate(list(masses_raw_text.keys())[4:]):
        mass_by_section = get_mass_by_sections(masses_raw_text[key], possible_sections)
        sections = list(mass_by_section.keys())

        reading_idxs = [
            section_idx
            for section_idx, title in enumerate(sections)
            if any(word in title for word in keywords)
        ]

        mass_metadata = {
            "season": season,
            "week": week,
            "day": days[mass_idx],
            "month": month,
        }

        create_xml_mass_readings(mass_metadata, reading_idxs, mass_by_section, f"{month}-{days[mass_idx]}")