In [1]:
#| default_exp frontend

In [91]:
#| export
from pathlib import Path
from bs4 import BeautifulSoup
from datetime import timedelta
import shutil

In [92]:
import json

In [93]:
# Notebook-level settings for the demo run below: the directory holding the HTML assets
# (including template.html) and the audio file of the episode to render.
asset_path = '../assets/html'
episode_file = "../data/podcast/people_i_admire_104_joy_of_maths/audio.mp3"

In [94]:
# Parse the page template so it can be inspected here; create_html() opens the same file itself.
with open(asset_path+'/template.html', 'r') as f:
    doc = BeautifulSoup(f, 'html.parser')

How should I format the transcript data so that each speaker's speech is displayed, grouped by topic? Do I keep this hierarchical structure, or is there a better way of doing it?

The argument against the hierarchical structure is that I want the topics to be less prominent than the speakers, so I don't want them simply separated by divs the way the speakers are. I think that's still okay: I can keep the hierarchical structure, with topics at the top level, each in an unobtrusive div which can simply be highlighted or navigated to. Actually, it doesn't work perfectly. Because of the nesting, segments are automatically split from the top down, so if a topic changes in the middle of a speaker's segment, that speaker's segment is split as well.

This isn't too much of an issue just now, so I will go with it, leaving room for improvement.

How am I going to synchronize the speaker-separated transcript with the topic-separated one? By indexing.

I need to index the speaker segments, and use that index to insert speaker labels on each of the topic sentences. This comes with a slight issue: the sentence split that was done earlier combined some of the sentences. That shouldn't be a problem though, as the speakers are actually labelled at word level.

In [95]:
# Load the processed transcript for the demo episode; the cells below render it to HTML.
with open("../data/podcast/people_i_admire_104_joy_of_maths/transcript.json", "r") as f:
    transcript = json.load(f)

In [96]:
# Inspect which fields the first topic entry carries.
transcript['topics'][0].keys()

dict_keys(['type', 'label', 'start', 'end', 'text', 'groups', 'title', 'summary', 'summary_unparsed'])

In [97]:
# Load the speaker identity mapping for the demo episode.
# NOTE(review): `speaker_ids` is not referenced again in the visible cells; add_speaker_id_fields()
# reads transcript['speaker_ids'] instead, so confirm whether this standalone load is still needed.
with open("../data/podcast/people_i_admire_104_joy_of_maths/speaker-ids.json", "r") as f:
    speaker_ids = json.load(f)

In [100]:
#| export
def add_speaker_id_fields(transcript):
    """Merge speaker identity fields into every matching speech group of `transcript`.

    For each speech in `transcript['topics'][*]['groups']`, the entry of
    `transcript['speaker_ids']` keyed by the speech's `label` (if any) is merged
    into the speech dict with `dict.update`.

    The transcript is modified in place and also returned. A transcript without
    a `speaker_ids` entry is returned unchanged instead of raising `KeyError`.
    """
    # `speaker_ids` is optional: with no mapping there is nothing to merge.
    speaker_ids = transcript.get('speaker_ids') or {}
    for topic in transcript['topics']:
        for speech in topic['groups']:
            identity = speaker_ids.get(speech['label'])
            # Speeches whose label has no identity entry keep their original fields.
            if identity is not None: speech.update(identity)
    return transcript

In [101]:
# Enrich the speech groups with speaker identity fields (this updates `transcript` in place).
transcript = add_speaker_id_fields(transcript)

In [102]:
# Inspect the fields of the first speech group after the speaker identity merge.
transcript['topics'][0]['groups'][0].keys()

dict_keys(['type', 'label', 'start', 'end', 'text', 'groups', 'name', 'role'])

In [103]:
#| export
def format_timestamp(seconds):
    "Format an offset in seconds as the `str()` of a `timedelta`, e.g. `0:01:05`; the fractional part is dropped."
    whole_seconds = int(seconds)
    return str(timedelta(seconds=whole_seconds))

In [104]:
#| export 
def create_overview_summary_div(whole_summary):
    "Build the `transcript-summary-whole` div wrapping `whole_summary` in a single paragraph."
    soup = BeautifulSoup('', 'html.parser')
    wrapper = soup.new_tag('div', attrs={'class': 'transcript-summary-whole'})
    paragraph = soup.new_tag('p', attrs={'class': 'transcript-summary-whole-paragraph'})
    paragraph.string = whole_summary
    wrapper.append(paragraph)
    return wrapper

In [105]:
#| export
def create_overview_toc_div(topics):
    "Build the `transcript-toc` div: an ordered list with one `#topic_<label>` link per topic, showing its title."
    soup = BeautifulSoup('', 'html.parser')
    entries = soup.new_tag('ol', attrs={'class': 'toc-list'})
    for topic in topics:
        entry = soup.new_tag('li')
        # The anchor targets an element id of the form `topic_<label>`.
        link = soup.new_tag('a', href=f"#topic_{topic['label']}")
        title = soup.new_tag('p', attrs={'class': 'title'})
        title.string = topic['title']
        link.append(title)
        entry.append(link)
        entries.append(entry)
    toc = soup.new_tag('div', attrs={'class': 'transcript-toc'})
    toc.append(entries)
    return toc


In [106]:
#| export 
def create_overview_div(transcript):
    "Build the `transcript-overview` div: the transcript's summary followed by the table of contents of its topics."
    soup = BeautifulSoup('', 'html.parser')
    overview = soup.new_tag('div', attrs={'class': 'transcript-overview'})
    summary_section = create_overview_summary_div(transcript['summary'])
    overview.append(summary_section)
    toc_section = create_overview_toc_div(transcript['topics'])
    overview.append(toc_section)
    return overview

In [107]:
#| export
def create_topic_summary_div(summary):
    "Build a `transcript-summary` div wrapping `summary` in a single `transcript-summary-paragraph`."
    soup = BeautifulSoup('', 'html.parser')
    wrapper = soup.new_tag('div')
    wrapper['class'] = 'transcript-summary'
    paragraph = soup.new_tag("p")
    paragraph['class'] = 'transcript-summary-paragraph'
    paragraph.string = summary
    wrapper.append(paragraph)
    return wrapper

In [108]:
#| export
def _info_span_text(obj, attr, summary):
    "Return the text shown for field `attr` of `obj` inside its info button."
    if attr == 'start': return format_timestamp(obj[attr])
    # Only the label gets special formatting; every other field is shown verbatim.
    if attr != 'label': return str(obj[attr])
    # Summarised groups are numbered from one and show their title.
    if summary: return str(obj[attr]+1) + ": " + obj['title']
    if obj['type'] == "speaker":
        # Prefer the resolved name; fall back to the raw label (which may not be a string).
        speaker_label = obj['name'] if 'name' in obj else str(obj['label'])
        if 'role' in obj: speaker_label += f" ({obj['role']})"
        return speaker_label
    return str(obj[attr])

def create_info_div(obj, attrs, summary=False):
    """Build the `transcript-info` div holding a button with one span per requested field.

    `obj` is a group dict with at least a `type` key. `attrs` lists the field names to show;
    names missing from `obj` are skipped. Each span's class is the field name, and its text is:

    - `start`: the value formatted with `format_timestamp`
    - `label`: `"<label+1>: <title>"` when `summary` is true; otherwise, for `type == "speaker"`,
      the speaker's `name` (or the label when there is no name) followed by ` (<role>)` when a
      role is present; otherwise the label itself
    - anything else: `str()` of the value
    """
    doc = BeautifulSoup('', 'html.parser')
    info_div = doc.new_tag('div')
    info_div['class'] = 'transcript-info ' + obj['type']
    info_button = doc.new_tag('button')
    info_button['class'] = 'transcript-info-button ' + obj['type']
    for a in attrs:
        if a not in obj: continue
        span = doc.new_tag('span')
        span['class'] = a
        span.string = _info_span_text(obj, a, summary)
        info_button.append(span)
    info_div.append(info_button)
    return info_div

In [109]:
#| export
def create_group_div(group, fields=['label', 'start'], summary=False):
    "Build the `transcript-<type>` div for `group`, with id `<type>_<label>`, its info header and, when `summary` is set, its summary."
    soup = BeautifulSoup('', 'html.parser')
    kind = group['type']
    container = soup.new_tag('div')
    container['class'] = 'transcript-' + kind
    container['id'] = "_".join((str(kind), str(group['label'])))
    header = create_info_div(group, fields, summary)
    container.append(header)
    # Groups rendered without a summary consist of the info header only.
    if not summary: return container
    container.append(create_topic_summary_div(group['summary']))
    return container

In [110]:
#| export
def create_paragraph_div(paragraph):
    "Build a `transcript-paragraph` `<p>` with one clickable `transcript-word` span per entry in `paragraph['words']`."
    soup = BeautifulSoup('', 'html.parser')
    p_tag = soup.new_tag('p')
    p_tag['id'] = str(paragraph['label'])
    p_tag['class'] = 'transcript-paragraph'
    for word in paragraph['words']:
        word_span = soup.new_tag('span')
        word_span['class'] = 'transcript-word'
        # A trailing space keeps adjacent words separated in the rendered text.
        word_span.string = word['word'] + " "
        started_at = word['start']
        # The word's start value is both the span id and the argument of the click handler.
        word_span['id'] = started_at
        word_span['onclick'] = f"setAudioTime({started_at})"
        p_tag.append(word_span)
    return p_tag

In [111]:
#| export
def create_transcript_div(transcript):
    "Build the `transcript` div from an iterable of topics: topic div > speech divs > `transcript-text` div > paragraphs."
    soup = BeautifulSoup('', 'html.parser')
    root = soup.new_tag('div')
    root['class'] = 'transcript'
    for topic in transcript:
        # A topic shows its summary only when the summary value is truthy.
        has_summary = bool(topic['summary'])
        topic_div = create_group_div(topic, summary=has_summary)
        for speech in topic['groups']:
            speech_div = create_group_div(speech)
            text_div = soup.new_tag('div')
            text_div['class'] = 'transcript-text'
            for paragraph in speech['groups']:
                text_div.append(create_paragraph_div(paragraph))
            speech_div.append(text_div)
            topic_div.append(speech_div)
        root.append(topic_div)
    return root

In [112]:
#| export
def write_output(doc, episode_file, asset_path):
    """Write the rendered page and its assets to an `output` directory next to `episode_file`.

    Any existing `output` directory is deleted first. Everything under `asset_path` except
    `template.html` is then copied into it, and `doc.prettify()` is written to `output.html`
    as UTF-8.

    Returns the path of the written `output.html`.
    """
    output_path = Path(episode_file).parent/'output'
    # copytree will not copy into an existing directory, so start from a clean slate.
    if output_path.exists(): shutil.rmtree(output_path)
    shutil.copytree(asset_path, output_path, ignore=shutil.ignore_patterns("template.html"))
    output_file = output_path/'output.html'
    # Pin the encoding: the platform default may be unable to represent non-ASCII transcript text.
    with open(output_file, 'w', encoding='utf-8') as f: f.write(str(doc.prettify()))
    return output_file

In [113]:
#| export
def create_html(transcript, episode_file, asset_path='./assets/html'):
    """Render `transcript` into the HTML template and write the result next to `episode_file`.

    The template is read from `template.html` inside `asset_path` (a string or `Path`). If the
    transcript has a `speaker_ids` entry, speaker identity fields are merged into its speech
    groups first, which updates `transcript` in place. The overview and the transcript body are
    then appended to the template's first `div` with class `transcript-wrapper`, and the page is
    written with `write_output`.

    Returns the populated `BeautifulSoup` document. Raises `IndexError` when the template has no
    `transcript-wrapper` div.
    """
    # Build the path with pathlib so `asset_path` may be a Path as well as a string.
    template_file = Path(asset_path)/'template.html'
    with open(template_file, 'r', encoding='utf-8') as f: doc = BeautifulSoup(f, 'html.parser')

    if 'speaker_ids' in transcript: transcript = add_speaker_id_fields(transcript)

    doc_transcript_div = doc.find_all('div', {'class': 'transcript-wrapper'})[0]
    doc_transcript_div.append(create_overview_div(transcript))
    doc_transcript_div.append(create_transcript_div(transcript['topics']))

    write_output(doc, episode_file, asset_path)

    return doc

In [114]:
# Render the demo episode; the page is written to an `output` directory next to `episode_file`.
doc = create_html(transcript, episode_file, asset_path)

In [115]:
#| hide
# Run nbdev's export step for the cells marked `#| export` (target module set by `default_exp` at the top).
from nbdev import nbdev_export
nbdev_export()