# Parsing masses from iSilo and generating structured files

## Import Libraries
This cell imports necessary libraries for the notebook. BeautifulSoup is used for parsing HTML content, ElementTree for working with XML data, and other libraries for handling regular expressions and file operations.

In [2]:
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import xml.dom.minidom
import re
import os
import json
from collections import defaultdict

### Using recursive default dicts to have more flexibility

In [3]:
def recursive_defaultdict():
    """Create an empty, arbitrarily deep auto-nesting dictionary.

    Returns:
        A ``defaultdict`` whose default factory is this same function, so
        reading a missing key creates (and stores) another nested
        ``defaultdict`` of the same kind.
    """
    nested = defaultdict(recursive_defaultdict)
    return nested

def defaultdict_to_dict(d):
    """Recursively replace ``defaultdict`` instances inside ``d`` with plain dicts.

    The walk descends through defaultdicts, plain dicts and lists, so a
    defaultdict stored inside a regular dict or a list is converted as well
    (not only those reachable through an unbroken chain of defaultdicts).

    Args:
        d: Any object. Only ``defaultdict``, exact ``dict`` and ``list``
            instances are traversed; everything else is returned untouched.

    Returns:
        A structure with the same content in which every traversed mapping is
        a plain ``dict``. Traversed containers are rebuilt (new objects); the
        input is not modified.
    """
    # Other dict subclasses (e.g. OrderedDict) are deliberately left as-is so
    # their type is not silently changed.
    if isinstance(d, defaultdict) or type(d) is dict:
        return {key: defaultdict_to_dict(value) for key, value in d.items()}
    if isinstance(d, list):
        return [defaultdict_to_dict(item) for item in d]
    return d

## Helper functions

### Function: extract_sections
This function reads an HTML file and groups its text by mass. It uses BeautifulSoup to walk the elements inside `<body>`: consecutive `<h3>` headings are joined into the title of a mass, and each following `<p>` paragraph is appended to that mass. The result is a dictionary mapping each title to its list of paragraph texts, ready for further processing.

In [4]:
def extract_sections(file_path):
    """Group the paragraphs of an HTML file under the heading that precedes them.

    Every element inside ``<body>`` is visited in document order. Runs of
    consecutive ``<h3>`` elements are concatenated (space separated) into one
    title; the first ``<p>`` after such a run registers the title and every
    ``<p>`` until the next heading run is stored under it. Elements whose text
    is exactly a non-breaking space are skipped.

    Args:
        file_path: Path of the UTF-8 encoded HTML file to read.

    Returns:
        dict mapping each title (stripped, newlines replaced by spaces) to the
        list of paragraph texts that follow it.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        soup = BeautifulSoup(handle, 'html.parser')

    paragraphs_by_mass = {}
    title = ''
    collecting_title = False  # True while we are inside a run of <h3> elements

    for node in soup.body.find_all():
        content = node.get_text()
        if content == '\xa0':
            continue

        tag = node.name
        if tag == 'h3':
            # A heading that does not directly follow another heading starts a new title.
            if not collecting_title:
                title = ''
            title = title + ' ' + content
            collecting_title = True
            continue

        if tag != 'p':
            continue

        if collecting_title:
            # First paragraph after the heading run: finalize and register the title.
            title = title.strip().replace('\n', ' ')
            paragraphs_by_mass[title] = []
            collecting_title = False
        if title != '':
            paragraphs_by_mass[title].append(content)

    return paragraphs_by_mass

### Function: get_mass_by_sections
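This function takes the raw paragraphs of one mass and groups them into sections. A paragraph that contains any of the given section names starts a new section and becomes its title; the paragraphs that follow are stored under that title until the next section title appears. Paragraphs that come before the first section title are ignored.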

In [5]:
def get_mass_by_sections(mass_raw_text, sections):
    """Split the paragraphs of one mass into titled sections.

    Args:
        mass_raw_text: Iterable of paragraph strings, in document order.
        sections: Section names; a paragraph containing any of them (substring
            match) is treated as a section title.

    Returns:
        dict mapping each title paragraph to the list of paragraphs that
        follow it. Newlines inside paragraphs are replaced by spaces, and
        paragraphs before the first title are dropped.
    """
    grouped = {}
    active_title = ''
    for paragraph in mass_raw_text:
        line = paragraph.replace('\n', ' ')
        if any(marker in line for marker in sections):
            # Title paragraph: open a fresh (or reset an existing) section.
            active_title = line
            grouped[active_title] = []
        elif active_title != '':
            grouped[active_title].append(line)

    return grouped

### Possible sections to be found inside the raw data

In [6]:
# Section names looked for in the raw paragraphs. They are passed to
# get_mass_by_sections, which matches them as substrings of each paragraph.
possible_sections = [
    # Antiphons and prayers (mapped to proper types by create_json_mass_propers).
    "ANTÍFONA DE ENTRADA",
    "ORAÇÃO COLECTA",
    "ORAÇÃO SOBRE AS OBLATAS",
    "ANTÍFONA DA COMUNHÃO",
    "ORAÇÃO DEPOIS DA COMUNHÃO",
    # Readings (filtered out by the parsing cells before the propers are built).
    # NOTE(review): the trailing space in "LEITURA I " looks deliberate — confirm before normalizing.
    "LEITURA I ",
    "SALMO RESPONSORIAL",
    "ALELUIA",
    "LEITURA II",
    "EVANGELHO"
]

## Scripts for Parsing Files to JSON

### Creating advent dictionary

In [7]:
# Accumulator filled by the parsing cells below. Missing keys create nested
# defaultdicts on first access, so entries can be assigned without
# initializing intermediate levels.
advent_propers = defaultdict(recursive_defaultdict)

### Weeks 1-3

In [8]:
def create_json_mass_propers(propers_idxs, mass_by_section):
    """Build the propers dictionary of one mass from its titled sections.

    Args:
        propers_idxs: Positions, in the key order of ``mass_by_section``, of
            the sections to convert into propers.
        mass_by_section: Mapping of section title to its list of paragraph
            strings. A title has the form ``"NAME"`` or ``"NAME - reference"``.

    Returns:
        dict keyed by proper type (``'entrance'``, ``'collect'``,
        ``'offerings'``, ``'communion'``, ``'post-communion'``). Each value is
        ``{'reference': str or None, 'text': str or None}`` where ``text`` is
        the first paragraph of the section, or ``None`` when the section has
        no paragraphs. Sections whose title matches no known proper are
        reported on stdout and left out.
    """
    # (keywords that must all appear in the title-cased name, proper type).
    # Every rule is evaluated and a later match overrides an earlier one.
    rules = (
        (('Entrada',), 'entrance'),
        (('Colecta',), 'collect'),
        (('Oblatas',), 'offerings'),
        (('Antífona', 'Comunhão'), 'communion'),
        (('Depois',), 'post-communion'),
    )

    titles = list(mass_by_section.keys())
    propers = {}

    for idx in propers_idxs:
        title = titles[idx]

        # Everything after the first ' - ' is the reference.
        raw_name, _, reference = title.partition(' - ')
        name = raw_name.title()
        if reference == "":
            reference = None

        proper_type = None
        for required_words, candidate in rules:
            if all(word in name for word in required_words):
                proper_type = candidate

        if proper_type is None:
            print("Proper type not recognized")
            continue

        paragraphs = mass_by_section[title]
        propers[proper_type] = {
            'reference': reference,
            # A title with no paragraphs after it yields None instead of raising IndexError.
            'text': paragraphs[0] if paragraphs else None,
        }

    return propers

In [9]:
# HTML exports for the first three weeks of Advent.
file_paths = [
    "../_old/AdvSem01.htm",
    "../_old/AdvSem02.htm",
    "../_old/AdvSem03.htm",
]

# Weekday label for each mass, indexed by the mass's position in the file.
weekdays = ["1", "1","1","1", "2", "3", "4", "5", "6", "7"]
cycles = ["A", "B", "C"]

# Positions in each file that are not parsed here.
# NOTE(review): presumably the per-cycle (A/B/C) Sunday entries — confirm against the source files.
skipped_positions = [1, 2, 3]

# Section titles containing any of these words are readings, not propers.
keywords = ["EVANGELHO", "LEITURA", "ALELUIA", "SALMO"]

for file_path in file_paths:
    # The two characters before the 4-character extension, e.g. "01" in "AdvSem01.htm".
    week_key = f'week-{file_path[-6:-4]}'
    masses_raw_text = extract_sections(file_path)

    # Distinct index names: the original reused `i` for the file loop, the
    # mass loop and the comprehension, which obscured which one indexed `weekdays`.
    for mass_idx, key in enumerate(masses_raw_text):
        if mass_idx in skipped_positions:
            continue
        mass_by_section = get_mass_by_sections(masses_raw_text[key], possible_sections)
        sections = list(mass_by_section.keys())

        propers_idxs = [
            section_idx
            for section_idx, element in enumerate(sections)
            if not any(word in element for word in keywords)
        ]

        propers = create_json_mass_propers(propers_idxs, mass_by_section)

        advent_propers[week_key][weekdays[mass_idx]] = propers

### Week 4 (Sundays)

In [10]:
season = "advent"

file_paths = [
    "../_old/AdvSem04.htm",
]

# Section titles containing any of these words are readings, not propers.
keywords = ["EVANGELHO", "LEITURA", "ALELUIA", "SALMO"]

for file_path in file_paths:
    masses_raw_text = extract_sections(file_path)
    # Only the first mass of the file is handled in this cell, so the
    # position-based skip used for weeks 1-3 could never trigger and is omitted.
    for mass_idx, key in enumerate(list(masses_raw_text.keys())[:1]):
        mass_by_section = get_mass_by_sections(masses_raw_text[key], possible_sections)
        sections = list(mass_by_section.keys())

        propers_idxs = [
            section_idx
            for section_idx, element in enumerate(sections)
            if not any(word in element for word in keywords)
        ]

        propers = create_json_mass_propers(propers_idxs, mass_by_section)
        print(propers)

        # `weekdays` comes from the "Weeks 1-3" cell, which must have been run first.
        advent_propers[f'week-{file_path[-6:-4]}'][weekdays[mass_idx]] = propers

{'entrance': {'reference': 'Is 45, 8', 'text': 'Desça o orvalho do alto dos Céus e as nuvens chovam o Justo. Abra-se a terra e germine o Salvador.'}, 'collect': {'reference': None, 'text': 'Infundi, Senhor, a vossa graça em nossas almas, para que nós, que pela anunciação do Anjo conhecemos a encarnação de Cristo, vosso Filho, pela sua paixão e morte na cruz alcancemos a glória da ressurreição. Por Nosso Senhor Jesus Cristo, vosso Filho, que é Deus convosco na unidade do Espírito Santo.'}, 'offerings': {'reference': None, 'text': 'Aceitai, Senhor, os dons que trazemos ao vosso altar e santificai-os com o mesmo Espírito que, pelo poder da sua graça, fecundou o seio da Virgem Santa Maria. Por Nosso Senhor Jesus Cristo, vosso Filho, que é Deus convosco na unidade do Espírito Santo.'}, 'communion': {'reference': 'cf. Is 7, 14', 'text': 'A Virgem conceberá e dará à luz um filho. O seu nome será Emanuel, Deus-connosco.'}, 'post-communion': {'reference': None, 'text': 'Tendo recebido neste sac

### Week 4 (Specific days)

In [11]:
season = "advent"
month = "december"

file_paths = [
    "../_old/AdvSem04.htm",
]

# Day-of-month labels "17" .. "24", assigned in order to the masses parsed below.
days = [str(day) for day in range(17, 25)]

# Section titles containing any of these words are readings, not propers.
keywords = ["EVANGELHO", "LEITURA", "ALELUIA", "SALMO"]

for file_path in file_paths:
    masses_raw_text = extract_sections(file_path)
    # Skip the first four entries of the file; the remaining ones are paired with `days` by position.
    for day_idx, key in enumerate(list(masses_raw_text.keys())[4:]):
        mass_by_section = get_mass_by_sections(masses_raw_text[key], possible_sections)
        sections = list(mass_by_section.keys())

        propers_idxs = [
            section_idx
            for section_idx, element in enumerate(sections)
            if not any(word in element for word in keywords)
        ]

        propers = create_json_mass_propers(propers_idxs, mass_by_section)
        print(propers)

        # Use the `month` variable rather than repeating the "december" literal.
        advent_propers[month][days[day_idx]] = propers

{'entrance': {'reference': 'cf. Is 49, 13', 'text': 'Alegrem-se os Céus, exulte a terra: o Senhor visitará o seu povo.'}, 'collect': {'reference': None, 'text': 'Deus, criador e redentor do género humano, que no seio da bem-aventurada Virgem Maria quisestes realizar o grande mistério da encarnação do Verbo, ouvi a nossa oração e concedei que o vosso Filho Unigénito, feito homem como nós, nos torne participantes da sua vida divina. Ele que é Deus convosco na unidade do Espírito Santo.'}, 'offerings': {'reference': None, 'text': 'Santificai, Senhor, os dons da vossa Igreja e pela celebração destes sagrados mistérios dai-nos como alimento o pão do Céu. Por Nosso Senhor Jesus Cristo, vosso Filho, que é Deus convosco na unidade do Espírito Santo.'}, 'communion': {'reference': 'cf. Ageu 2, 8', 'text': 'Eis que vem o desejado de todos os povos e encherá de glória o templo do Senhor.'}, 'post-communion': {'reference': None, 'text': 'Deus omnipotente, que nos alimentais com o pão da vida, conce

In [12]:
file_path = '../_new/pt/advent.json'

# Read the existing JSON document and keep only its 'readings' entry.
with open(file_path, 'r', encoding='utf-8') as existing_json:
    advent_document = json.load(existing_json)
advent_readings = advent_document['readings']

In [13]:
# Normalize both parts to plain dicts before serializing. The propers are the
# structure actually built from defaultdicts, so they must be converted too.
advent_readings = defaultdict_to_dict(advent_readings)
advent = {
    'propers': defaultdict_to_dict(advent_propers),
    'readings': advent_readings
}

output_file_path = "../_new/pt/advent.json"

# Serialize first: opening the file in 'w' mode truncates it, so a failure
# while encoding must happen before the existing file content is discarded.
advent_json = json.dumps(advent, ensure_ascii=False, indent=4)

with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write(advent_json)