In [89]:
#| default_exp frontend

In [90]:
#| export
from pathlib import Path
from bs4 import BeautifulSoup
from datetime import timedelta
import os

In [91]:
import json

In [92]:
# Inputs for the interactive run in this notebook: the HTML template to fill in
# and the episode audio file (its parent directory is where the page is saved).
template_path = "../assets/html/template.html"
episode_file = "../data/podcast/lex_ai_stephen_wolfram_1/audio_formatted.mp3"

In [93]:
# Parse the HTML template into the notebook-level `doc`; the tag-building
# functions defined below call `doc.new_tag(...)` on this object.
with open(template_path) as template_file:
    doc = BeautifulSoup(template_file, 'html.parser')

How do I format the transcript data in order to display each speaker, separated by topic? Do I keep this hierarchical structure, or is there a better way of doing it?

The argument against the hierarchical structure is that I want the topics to be less pronounced, so I don't want them simply separated by divs the way the speakers are. I think that's still okay: I can keep the hierarchical structure, with topics on top in an unobtrusive div that can simply be highlighted or navigated to. Actually, I can't. Due to the nature of this structure, segments will automatically be split from the top down, so if a topic changes in the middle of a speaker's segment, that segment will also be split.

This isn't too much of an issue just now, so I will go with it, leaving room for improvement.

How am I going to synchronize the speaker-separated transcript with the topic-separated one? By indexing.

I need to index the speaker segments, and use that index to insert speaker labels on each of the topic sentences. This comes with a slight issue, in that the sentence split that was done earlier combined some of the sentences. That shouldn't be a problem though, as the speakers are actually labelled at word level.

In [94]:
import torch
import numpy as np

In [95]:
# Load the transcript JSON for the sample episode into `transcript`.
with open("../data/podcast/lex_ai_stephen_wolfram_1/transcript.json") as transcript_file:
    transcript = json.load(transcript_file)

In [96]:
#| export
def format_timestamp(seconds):
    """Format a duration given in seconds the way ``str(timedelta)`` does.

    `seconds` is first truncated to a whole number with ``int``, so any
    fractional part is dropped rather than rounded (e.g. 59.9 -> '0:00:59').
    """
    whole_seconds = int(seconds)
    return str(timedelta(seconds=whole_seconds))

In [97]:
#| export 
def create_overview_summary_div(whole_summary):
    """Wrap the whole-episode summary text in its overview markup.

    Returns a ``div`` with class ``transcript-summary-whole`` containing one
    ``p`` with class ``transcript-summary-whole-paragraph`` whose text is
    `whole_summary`.

    Tags are created through the module-level name `doc`, which must already
    be bound when this function is called.
    """
    paragraph = doc.new_tag('p', attrs={'class': 'transcript-summary-whole-paragraph'})
    paragraph.string = whole_summary

    wrapper = doc.new_tag('div', attrs={'class': 'transcript-summary-whole'})
    wrapper.append(paragraph)
    return wrapper

In [98]:
#| export
def create_overview_toc_div(topics):
    """Build the table of contents: an ordered list with one link per topic.

    Each entry of `topics` is read for its 'label' (used in the link target
    ``#topic_<label>``) and its 'title' (the visible link text).

    Returns a ``div`` with class ``transcript-toc`` holding an ``ol`` with
    class ``toc-list``. Tags are created through the module-level name `doc`.
    """
    entries = doc.new_tag('ol', attrs={'class': 'toc-list'})
    for topic in topics:
        anchor = doc.new_tag('a', href=f"#topic_{topic['label']}")
        heading = doc.new_tag('p', attrs={'class': 'title'})
        heading.string = topic['title']
        anchor.append(heading)

        item = doc.new_tag('li')
        item.append(anchor)
        entries.append(item)

    container = doc.new_tag('div', attrs={'class': 'transcript-toc'})
    container.append(entries)
    return container


In [99]:
#| export 
def create_overview_div(transcript):
    """Assemble the overview section: whole-episode summary followed by the TOC.

    Reads ``transcript['summary']`` and ``transcript['topics']`` and returns a
    ``div`` with class ``transcript-overview`` containing, in order, the
    summary block and the table-of-contents block.
    """
    overview = doc.new_tag('div', attrs={'class': 'transcript-overview'})

    summary_block = create_overview_summary_div(transcript['summary'])
    overview.append(summary_block)

    toc_block = create_overview_toc_div(transcript['topics'])
    overview.append(toc_block)

    return overview

In [100]:
#| export
def create_topic_summary_div(summary):
    """Wrap a single topic's summary text in its markup.

    Returns a ``div`` with class ``transcript-summary`` containing one ``p``
    with class ``transcript-summary-paragraph`` whose text is `summary`.
    Tags are created through the module-level name `doc`.
    """
    paragraph = doc.new_tag("p")
    paragraph['class'] = 'transcript-summary-paragraph'
    paragraph.string = summary

    wrapper = doc.new_tag('div')
    wrapper['class'] = 'transcript-summary'
    wrapper.append(paragraph)
    return wrapper

In [101]:
#| export
def create_info_div(object, group_type, attrs, summary=False):
    """Build the info header (a button of labelled spans) for one group.

    Parameters
    ----------
    object : mapping
        The group record. It must contain every key named in `attrs`, and
        additionally 'title' when `summary` is true and 'label' is in `attrs`.
    group_type : str
        Appended to the CSS classes of the wrapper ``div`` and the ``button``.
    attrs : iterable of str
        Keys of `object` to show; one ``span`` is emitted per key, with the key
        itself used as the span's class.
    summary : bool, default False
        When true, the 'label' span shows ``"<label + 1>: <title>"`` instead of
        the bare label.

    Returns
    -------
    The ``div`` (class ``transcript-info <group_type>``) wrapping the button.

    Notes
    -----
    Tags are created through the module-level name `doc`.
    """
    info_div = doc.new_tag('div')
    info_div['class'] = 'transcript-info ' + group_type
    info_button = doc.new_tag('button')
    info_button['class'] = 'transcript-info-button ' + group_type
    for a in attrs:
        span = doc.new_tag('span')
        span['class'] = a
        if a == 'start':
            span.string = format_timestamp(object[a])
        elif a == 'label' and summary:
            # The previous condition was the constant-truthy `elif 'label':`,
            # which applied this "+ 1 and title" formatting to every
            # non-'start' attribute. Only the label gets it now.
            # Presumably labels are zero-based and shown one-based - confirm.
            span.string = str(object[a] + 1) + ": " + object['title']
        else:
            # Plain label (no summary) or any other attribute: show it as text.
            span.string = str(object[a])
        info_button.append(span)
    info_div.append(info_button)
    return info_div

In [102]:
#| export
def create_group_div(group, fields=['label', 'start'], summary=False):
    """Create the container ``div`` for one group (a topic or a speech segment).

    The container gets class ``transcript-<type>`` and id ``<type>_<label>``,
    both taken from `group`, and starts with the info header built from
    `fields`. When `summary` is true, the group's 'summary' text is appended
    as a summary block after the header.
    """
    kind = group['type']

    container = doc.new_tag('div')
    container['class'] = 'transcript-' + kind
    container['id'] = "_".join((str(kind), str(group['label'])))

    header = create_info_div(group, kind, fields, summary)
    container.append(header)

    if summary:
        container.append(create_topic_summary_div(group['summary']))
    return container

In [103]:
#| export
def create_paragraph_div(paragraph):
    """Render one paragraph as a ``p`` of individually clickable word spans.

    The ``p`` gets the paragraph's 'label' (as a string) for its id and class
    ``transcript-paragraph``. Every entry in ``paragraph['words']`` becomes a
    ``span`` with class ``transcript-word`` whose text is the word plus a
    trailing space, whose id is the word's 'start' value, and whose ``onclick``
    calls ``setAudioTime`` with that same value.
    """
    paragraph_tag = doc.new_tag('p')
    paragraph_tag['id'] = str(paragraph['label'])
    paragraph_tag['class'] = 'transcript-paragraph'

    for entry in paragraph['words']:
        text = entry['word'] + " "
        start = entry['start']

        word_span = doc.new_tag('span')
        word_span['class'] = 'transcript-word'
        word_span.string = text
        word_span['id'] = start
        word_span['onclick'] = f"setAudioTime({start})"
        paragraph_tag.append(word_span)

    return paragraph_tag

In [104]:
#| export
def create_transcript_div(transcript):
    """Build the full transcript body as nested topic / speech / paragraph markup.

    `transcript` is iterated as a sequence of topics. Each topic becomes a
    group ``div`` (with its summary block when ``topic['summary']`` is truthy).
    Each entry of ``topic['groups']`` becomes a speech group ``div`` inside it,
    and that speech's own 'groups' are rendered as paragraphs inside a ``div``
    with class ``transcript-text``.

    Returns the outer ``div`` with class ``transcript``.
    """
    root = doc.new_tag('div')
    root['class'] = 'transcript'

    for topic in transcript:
        has_summary = bool(topic['summary'])
        topic_container = create_group_div(topic, summary=has_summary)

        for speech in topic['groups']:
            speech_container = create_group_div(speech)

            text_container = doc.new_tag('div')
            text_container['class'] = 'transcript-text'
            for paragraph in speech['groups']:
                text_container.append(create_paragraph_div(paragraph))

            speech_container.append(text_container)
            topic_container.append(speech_container)

        root.append(topic_container)

    return root

In [105]:
#| export
def create_html(transcript, episode_file, template_path='./assets/html/template.html'):
    """Fill the HTML template with the transcript and save it beside the episode.

    Parameters
    ----------
    transcript : mapping
        Must provide 'summary' and 'topics' (read by the overview and
        transcript builders).
    episode_file : str or path-like
        Path of the episode file; the page is written to ``output-doc.html``
        in that file's parent directory.
    template_path : str or path-like
        HTML template containing a ``div`` with class ``transcript-wrapper``,
        into which the generated markup is appended.

    Returns
    -------
    The parsed-and-filled BeautifulSoup document.

    Raises
    ------
    IndexError
        If the template has no ``div`` with class ``transcript-wrapper``.

    Notes
    -----
    Rebinds the module-level name `doc` to the freshly parsed template.
    """
    # The tag-building helpers in this file call `doc.new_tag(...)` on the
    # module-level `doc`. Binding the template to a local of the same name left
    # them using whatever global happened to exist (or raising NameError when
    # none does, e.g. in the exported module), so bind the global instead.
    global doc
    with open(template_path, 'r', encoding='utf-8') as f:
        doc = BeautifulSoup(f, 'html.parser')

    wrappers = doc.find_all('div', {'class': 'transcript-wrapper'})
    if not wrappers:
        # Same exception type as the bare `[0]` lookup, with an actionable message.
        raise IndexError(
            f'no <div class="transcript-wrapper"> found in template {template_path!r}'
        )
    doc_transcript_div = wrappers[0]
    doc_transcript_div.append(create_overview_div(transcript))
    doc_transcript_div.append(create_transcript_div(transcript['topics']))

    # Explicit UTF-8 so the output does not depend on the platform's locale
    # encoding (transcript text may contain non-ASCII characters).
    output_path = Path(episode_file).parent/'output-doc.html'
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(str(doc))

    return doc

In [106]:
# Render the sample episode; the returned document replaces the notebook-level `doc`.
doc = create_html(
    transcript,
    episode_file,
    template_path=template_path,
)

In [107]:
#| hide
# Run nbdev's export step. Presumably this writes the `#| export` cells to the
# module named by `default_exp` at the top of the notebook - confirm against
# the nbdev documentation.
from nbdev import nbdev_export
nbdev_export()