In [None]:
import cssutils
import json
import unicodedata

from bs4 import BeautifulSoup
from pathlib import Path

In [None]:
# Directory that is scanned for the raw transcript exports (*.html files).
INPUT_DIR = Path("../data/transcripts/")
# Directory that receives one cleaned .json file per transcript.
OUTPUT_DIR = Path("../data/clean/")

In [None]:
# Convert every HTML transcript in INPUT_DIR into a JSON list of speaker turns
# ({'speaker': ..., 'text': ...}) written under the same base name to OUTPUT_DIR.

# Writing the results below fails if the target directory does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for input_file in INPUT_DIR.glob("*.html"):
    # Hand BeautifulSoup the raw bytes so it detects the document encoding
    # itself instead of relying on the platform's default text encoding.
    soup = BeautifulSoup(input_file.read_bytes(), 'html.parser')

    # Collect the CSS classes (".cN" selectors) whose rule sets font-weight 700;
    # paragraphs containing a span with one of these classes are treated as
    # speaker names.
    # NOTE(review): only the first <style> element is inspected and the
    # selector text is used verbatim (minus the leading dot) - confirm that
    # compound selectors never occur in the exports.
    speaker_classes = set()
    style_tag = soup.find('style')
    if style_tag is not None:  # a document without a stylesheet has no speaker classes
        for rule in cssutils.parseString(style_tag.encode_contents()):
            if (
                rule.type == rule.STYLE_RULE
                and rule.selectorText.startswith('.c')
                and rule.style.fontWeight == '700'
            ):
                speaker_classes.add(rule.selectorText[1:])
    # BeautifulSoup documents lists (not sets) as class filters.
    speaker_class_list = sorted(speaker_classes)

    parsed_document = []
    for paragraph in soup.find_all('p'):
        # Collapse all runs of whitespace to single spaces, then apply NFKD
        # compatibility normalisation.
        text = unicodedata.normalize("NFKD", ' '.join(paragraph.text.strip().split()))
        # Skip the search entirely when no speaker class was found: an empty
        # class filter must never mark a paragraph as a speaker name.
        is_name = bool(speaker_class_list) and (
            paragraph.find('span', class_=speaker_class_list) is not None
        )

        if is_name:
            # A speaker name opens a new turn; its text is filled in by the
            # paragraphs that follow.
            parsed_document.append({'speaker': text})
        elif not parsed_document:
            # Text that appears before the first speaker name.
            parsed_document.append({'speaker': 'N/A', 'text': text})
        elif 'text' in parsed_document[-1]:
            parsed_document[-1]['text'] += ' ' + text
        else:
            parsed_document[-1]['text'] = text

    # with_suffix swaps only the final extension, unlike str.replace which
    # rewrites every '.html' occurrence in the name and misses other casings.
    output_file = OUTPUT_DIR / input_file.with_suffix('.json').name
    with open(output_file, 'wt', encoding='utf-8') as fh:
        json.dump(parsed_document, fh)