# Parsing masses from Isilo and generating Structured files

## Import Libraries
This cell imports necessary libraries for the notebook. BeautifulSoup is used for parsing HTML content, ElementTree for working with XML data, and other libraries for handling regular expressions and file operations.

In [34]:
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import xml.dom.minidom
import re
import os
import json
from collections import defaultdict

### Using recursive default dicts to have more flexibility

In [35]:
def recursive_defaultdict():
    """Build an empty, arbitrarily nestable ``defaultdict``.

    Every missing key is filled with another recursive defaultdict, so deep
    paths such as ``d['a']['b']['c'] = 1`` can be assigned without creating
    the intermediate levels first.
    """
    nested = defaultdict(recursive_defaultdict)
    return nested

def defaultdict_to_dict(d):
    """Return a plain-``dict`` copy of a (possibly nested) ``defaultdict``.

    Only ``defaultdict`` instances are converted; the conversion recurses into
    their values. Any other object (including plain dicts and lists) is
    returned unchanged, as the very same object.
    """
    if not isinstance(d, defaultdict):
        return d
    return {key: defaultdict_to_dict(value) for key, value in d.items()}

## Helper functions

### Function: extract_sections
This function reads an HTML file and extracts different sections of the text based on specific HTML tags. It uses BeautifulSoup for parsing HTML and organizes the sections into a dictionary. This is helpful for further processing and analysis of the text.

In [36]:
def extract_sections(file_path):
    """Read an HTML file and group its paragraph texts by mass title.

    Consecutive ``<h3>`` elements are joined (space separated) into one title.
    Each following ``<p>`` element's text is collected under that title until
    the next run of ``<h3>`` elements starts. Elements whose text is a lone
    non-breaking space are skipped, and paragraphs that appear before the
    first title are dropped.

    Args:
        file_path: Path of the UTF-8 encoded HTML file to parse.

    Returns:
        dict mapping each title (stripped, newlines replaced by spaces) to the
        list of paragraph texts that follow it, in document order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        document = BeautifulSoup(handle, 'html.parser')

    paragraphs_by_mass = {}
    title = ''
    collecting_title = False

    for node in document.body.find_all():
        node_text = node.get_text()
        if node_text == '\xa0':
            continue

        tag = node.name
        if tag == 'h3':
            # The first <h3> of a run starts a new title; the rest extend it.
            if not collecting_title:
                title = ''
            title = title + ' ' + node_text
            collecting_title = True
            continue
        if tag != 'p':
            continue

        if collecting_title:
            # First paragraph after a heading run: finalize the title and open its list.
            title = title.strip().replace('\n', ' ')
            paragraphs_by_mass[title] = []
            collecting_title = False
        if title != '':
            paragraphs_by_mass[title].append(node_text)

    return paragraphs_by_mass

### Function: get_mass_by_sections
This function splits the raw paragraphs of a single mass into sections. A paragraph that contains one of the given section titles starts a new section, and the paragraphs that follow are collected under it, which makes the text easier to process further.

In [37]:
def get_mass_by_sections(mass_raw_text, sections):
    """Group the paragraphs of one mass under their section titles.

    Newlines inside each paragraph are replaced by spaces. A paragraph that
    contains any of the ``sections`` markers becomes a section title (the full
    paragraph text is used as the key); the following paragraphs are appended
    to that section until the next title. Paragraphs before the first title
    are ignored.

    Args:
        mass_raw_text: Iterable of paragraph strings for one mass.
        sections: Marker substrings that identify a section-title paragraph.

    Returns:
        dict mapping each section-title paragraph to its list of paragraphs.
    """
    grouped = {}
    active_title = ''
    for raw_paragraph in mass_raw_text:
        paragraph = raw_paragraph.replace('\n', ' ')
        if any(marker in paragraph for marker in sections):
            active_title = paragraph
            grouped[active_title] = []
            continue
        if active_title != '':
            grouped[active_title].append(paragraph)

    return grouped

### Function: create_json_mass_readings
This function converts the organized readings of one mass into a JSON-serializable dictionary. For each section index in `reading_idxs`, it derives the reading type and the reference from the section title and builds an entry (reading, psalm, aleluia or gospel) from that section's paragraphs.

In [38]:
def _split_snippet_announcement_text(section_content, closing_pattern):
    """Split a reading/gospel section into optional snippet, announcement and text.

    A first paragraph starting with « is treated as the snippet; the next two
    paragraphs are the announcement and the text. ``closing_pattern`` is
    removed from the text with ``re.sub``.
    """
    parts = {}
    offset = 0
    if section_content[0].startswith("«"):
        parts["snippet"] = section_content[0]
        offset = 1
    parts["announcement"] = section_content[offset]
    parts["text"] = re.sub(closing_pattern, "", section_content[offset + 1])
    return parts


def create_json_mass_readings(reading_idxs, mass_by_section, filename):
    """Build a JSON-serializable dict with the readings of one mass.

    Args:
        reading_idxs: Indexes (into the key order of ``mass_by_section``) of
            the sections that hold readings.
        mass_by_section: dict mapping a section title such as
            ``"LEITURA I - <reference>"`` to its list of paragraphs.
        filename: Unused; kept so existing callers keep working.

    Returns:
        dict keyed by reading type: ``"reading-<N>"``, ``"alt-reading-<N>-<k>"``
        for the k-th repeated reading with the same number, ``"psalm"``,
        ``"aleluia"`` and ``"gospel"``. Each value holds the fields extracted
        for that reading. Sections whose title matches none of the known kinds
        are reported with ``print`` and skipped.

    Raises:
        IndexError: if a section has fewer paragraphs than its kind requires.
    """
    readings = {}

    sections = list(mass_by_section.keys())

    readings_present = []
    for idx in reading_idxs:
        title = sections[idx]

        data_from_title = title.split(' - ')
        name = data_from_title[0].title()
        name_words = name.split(" ")
        if name_words[0] == "Leitura":
            # str.title() turns "II" into "Ii"; restore the upper-case numeral.
            name = name_words[0].title() + ' ' + name_words[1].upper()
        reference = " - ".join(data_from_title[1:])

        if reference == "":
            reference = None

        section_content = mass_by_section[title]

        reading_data = {}
        reading_type = None

        if 'Leitura' in name:
            reading_type = "reading-" + name.split(" ")[-1]
            if reading_type in readings_present:
                # Count the alternatives seen so far in readings_present so each
                # one gets its own key instead of overwriting the previous one.
                alt_marker = 'alt-' + reading_type
                readings_present.append(alt_marker)
                reading_type = f"{alt_marker}-{readings_present.count(alt_marker)}"
            else:
                readings_present.append(reading_type)
            reading_data["reference"] = reference
            reading_data.update(
                _split_snippet_announcement_text(section_content, r"(Palavra do Senhor\.)$")
            )

        if 'Evangelho' in name:
            reading_type = "gospel"
            reading_data["reference"] = reference
            # Same layout as a reading: the «snippet» paragraph is optional.
            reading_data.update(
                _split_snippet_announcement_text(section_content, r"(Palavra da salvação\.)$")
            )

        if 'Aleluia' in name:
            reading_type = "aleluia"
            reading_data["reference"] = reference
            # Keep everything after the first ": " of the first paragraph.
            reading_data["response"] = ': '.join(section_content[0].split(': ')[1:])
            reading_data["text"] = section_content[1]

        if 'Salmo' in name:
            reading_type = "psalm"
            reading_data["reference"] = reference

            base_idx = 0
            if section_content[0].startswith('('):
                # A leading "(...)" paragraph is a notice; the response follows it.
                reading_data['notice'] = section_content[0]
                base_idx = 1

            reading_data["response"] = ': '.join(section_content[base_idx].split(': ')[1:])

            # Verses are taken every third paragraph; an "Ou:" paragraph carries
            # an alternative response and shifts where the verses start.
            if section_content[base_idx+2].split(' ')[0] == 'Ou:':
                reading_data['alt-response'] = ' '.join(section_content[base_idx+2].split(' ')[1:])
                if len(section_content[base_idx+3:]) % 3 == 1:
                    reading_data['verses'] = section_content[base_idx+4::3]
                else:
                    reading_data['verses'] = section_content[base_idx+3::3]
            else:
                reading_data['verses'] = section_content[base_idx+2::3]

        if reading_type is not None:
            readings[reading_type] = reading_data
        else:
            print("Reading type not recognized")

    return readings

### Possible sections to be found inside the raw data

In [39]:
# Upper-case markers that identify a section-title paragraph. The parsing
# cells pass this list to get_mass_by_sections as its `sections` argument,
# where each marker is matched as a substring of the paragraph text.
possible_sections = [
    "ANTÍFONA DE ENTRADA",
    "ORAÇÃO COLECTA",
    "ANTÍFONA DA COMUNHÃO",
    "ORAÇÃO SOBRE AS OBLATAS",
    "ORAÇÃO DEPOIS DA COMUNHÃO",
    "LEITURA I ",  # NOTE(review): trailing space looks deliberate (keeps this marker from matching "LEITURA II" titles) — confirm
    "SALMO RESPONSORIAL",
    "ALELUIA",
    "LEITURA II",
    "EVANGELHO"
]

## Scripts for Parsing Files to JSON

### Creating advent dictionary

In [40]:
# Nested container for every Advent reading; intermediate levels are created on first access.
advent_readings = recursive_defaultdict()

### Weeks 1-3

In [41]:
file_paths = [
    "../_old/AdvSem01.htm",
    "../_old/AdvSem02.htm",
    "../_old/AdvSem03.htm",
]

# Weekday assigned to each mass by its position in the file. The first three
# masses all map to weekday "1" and are told apart by liturgical cycle.
weekdays = ["1", "1", "1", "2", "3", "4", "5", "6", "7"]
cycles = ["A", "B", "C"]

# Section titles containing any of these words hold readings.
keywords = ["EVANGELHO", "LEITURA", "ALELUIA", "SALMO"]

for file_path in file_paths:
    masses_raw_text = extract_sections(file_path)

    # The two characters before ".htm" are the week number, e.g. "01".
    week_key = f'week-{file_path[-6:-4]}'
    # Start the week from scratch so re-running this cell does not append the
    # weekday-"1" cycles a second time.
    advent_readings[week_key] = recursive_defaultdict()

    # The first title in each file is skipped; the remaining ones are the masses.
    for mass_idx, key in enumerate(list(masses_raw_text.keys())[1:]):
        weekday = weekdays[mass_idx]

        mass_by_section = get_mass_by_sections(masses_raw_text[key], possible_sections)
        sections = list(mass_by_section.keys())
        reading_idxs = [
            section_idx
            for section_idx, element in enumerate(sections)
            if any(word in element for word in keywords)
        ]

        readings = create_json_mass_readings(reading_idxs, mass_by_section, weekday)
        if weekday == "1":
            # Weekday "1" holds a list with one entry per cycle, 'cycle' key first.
            if advent_readings[week_key][weekday] == {}:
                advent_readings[week_key][weekday] = []
            readings = {**{'cycle': cycles[mass_idx]}, **readings}
            advent_readings[week_key][weekday].append(readings)
        else:
            advent_readings[week_key][weekday] = readings

### Week 4 (Sundays)

In [42]:
season = "advent"

file_paths = [
    "../_old/AdvSem04.htm",
]

# Section titles containing any of these words hold readings.
keywords = ["EVANGELHO", "LEITURA", "ALELUIA", "SALMO"]

# NOTE: relies on `weekdays` and `cycles` defined in the "Weeks 1-3" cell.
for file_path in file_paths:
    masses_raw_text = extract_sections(file_path)

    # The two characters before ".htm" are the week number, e.g. "04".
    week_key = f'week-{file_path[-6:-4]}'
    # Start the week from scratch so re-running this cell does not append the
    # cycles a second time.
    advent_readings[week_key] = recursive_defaultdict()

    # Titles 1-3 of the file are handled here; the first title is skipped and
    # the remaining ones are processed by the "Specific days" cell.
    for mass_idx, key in enumerate(list(masses_raw_text.keys())[1:4]):
        weekday = weekdays[mass_idx]

        mass_by_section = get_mass_by_sections(masses_raw_text[key], possible_sections)
        sections = list(mass_by_section.keys())
        reading_idxs = [
            section_idx
            for section_idx, element in enumerate(sections)
            if any(word in element for word in keywords)
        ]

        readings = create_json_mass_readings(reading_idxs, mass_by_section, weekday)

        if weekday == "1":
            # Weekday "1" holds a list with one entry per cycle, 'cycle' key first.
            if advent_readings[week_key][weekday] == {}:
                advent_readings[week_key][weekday] = []
            readings = {**{'cycle': cycles[mass_idx]}, **readings}
            advent_readings[week_key][weekday].append(readings)
        else:
            advent_readings[week_key][weekday] = readings

### Week 4 (Specific days)

In [43]:
season = "advent"
month = "december"

file_paths = [
    "../_old/AdvSem04.htm",
]

# Day of the month assigned to each remaining mass, by position: "17" .. "24".
days = [str(day) for day in range(17, 25)]

# Section titles containing any of these words hold readings.
keywords = ["EVANGELHO", "LEITURA", "ALELUIA", "SALMO"]

for file_path in file_paths:
    masses_raw_text = extract_sections(file_path)

    # Titles from index 4 onwards are stored by calendar day instead of by week.
    for day_idx, key in enumerate(list(masses_raw_text.keys())[4:]):
        day = days[day_idx]

        mass_by_section = get_mass_by_sections(masses_raw_text[key], possible_sections)
        sections = list(mass_by_section.keys())
        reading_idxs = [
            section_idx
            for section_idx, element in enumerate(sections)
            if any(word in element for word in keywords)
        ]

        # The third argument is not used by create_json_mass_readings; pass the
        # day so this cell does not depend on `weekdays` from another cell.
        readings = create_json_mass_readings(reading_idxs, mass_by_section, day)

        advent_readings[month][day] = readings

In [None]:
# Convert the nested defaultdicts to plain dicts before serializing.
# Re-running is safe: defaultdict_to_dict returns non-defaultdict input unchanged.
advent_readings = defaultdict_to_dict(advent_readings)
advent = {'readings': advent_readings}

output_file_path = "../_new/advent.json"

# Create the output folder if needed so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# ensure_ascii=False keeps accented characters readable in the output file.
with open(output_file_path, 'w', encoding='utf-8') as file:
    json.dump(advent, file, ensure_ascii=False, indent=4)