# Prodigy Data Creation
This notebook creates the input files necessary for loading and labeling in Prodigy. In here we:
1. Split the input mp3s into one-second clips.
2. Fill an HTML template with the clip location and clip metadata.
3. Save the clips as mp3s, and the filled templates as JSONL for loading in Prodigy.

In [1]:
from pathlib import Path

import jsonlines
import numpy as np
import pydub

In [2]:
def split_audio(segment, episode):
    """Split an episode into 1-second clips saved under ``syntax-clips/``.

    Args:
        segment: Audio to split. It is sliced as ``segment[::1000]`` and each
            resulting chunk must provide ``export(file, format="mp3")``
            (a ``pydub.AudioSegment`` in this notebook, where a stepped slice
            yields consecutive 1000 ms chunks).
        episode: Episode identifier embedded in every clip file name.

    Clips are written to ``syntax-clips/syntax{episode}-{NNNN}.mp3`` where
    ``NNNN`` is the zero-padded chunk index. The output directory is created
    if it is missing. Clips that already exist are left untouched, so the
    function can be re-run to resume an interrupted split.

    Raises:
        Whatever ``chunk.export`` raises. The partially written clip is
        removed first so that a later run regenerates it instead of
        skipping it as "already done".
    """
    clips_dir = Path("syntax-clips")
    clips_dir.mkdir(parents=True, exist_ok=True)

    for i, chunk in enumerate(segment[::1000]):
        clip_path = clips_dir / f"syntax{episode}-{str(i).zfill(4)}.mp3"

        # "x" mode creates the file exclusively: an existing clip is skipped
        # rather than re-encoded.
        try:
            f = open(clip_path, "xb")
        except FileExistsError:
            continue

        try:
            with f:
                chunk.export(f, format="mp3")
        except BaseException:
            # Includes KeyboardInterrupt: never leave an empty/truncated clip
            # behind, because the exclusive-create guard would keep it forever.
            if clip_path.exists():
                clip_path.unlink()
            raise


# HTML filled in once per clip: a looping, auto-playing audio player followed
# by a caption. Format fields: {0} = clip file name (appended to
# http://localhost:9999/ -- assumes the clips are being served on that port),
# {1} = episode, {2} = second.
html_template = (
    '<audio controls autoplay loop>'
    '<source src="http://localhost:9999/{0}" type="audio/mp3">'
    '</audio>'
    '<p>Episode: {1}, Second: {2}</p>'
)


def create_jsonl(episode):
    """Write the Prodigy JSONL task file for one episode.

    Collects the clips named ``syntax{episode}-{second}.mp3`` in
    ``./syntax-clips/`` and writes one record per clip to
    ``./prodigy-json/{episode}.jsonl``. Each record has:

    * ``html``: ``html_template`` filled with the clip file name, the episode
      and the clip's second (an autoplaying, looping audio player).
    * ``text``: ``"E{episode}:S{second}"``.

    Args:
        episode: Episode identifier as used in the clip file names.

    The second is read from each clip's file name rather than from its
    position in the listing, so a missing clip or an index wider than four
    digits cannot shift the labels. Records are ordered by numeric second.
    The output directory is created if it is missing.
    """
    # The trailing "-" keeps the glob from matching clips of another episode
    # whose identifier merely starts with this one.
    prefix = f"syntax{episode}-"

    clips = []
    for mp3_path in Path("./syntax-clips/").glob(f"{prefix}*.mp3"):
        second = mp3_path.stem[len(prefix):]
        # Ignore stray files that match the glob but carry no clip index.
        if second.isdecimal():
            clips.append((int(second), second, mp3_path))

    # Numeric order; a plain lexical sort misorders indices past 9999.
    clips.sort(key=lambda clip: (clip[0], clip[2].name))

    data = []
    for _, second, mp3_path in clips:
        html = html_template.format(mp3_path.name, episode, second)
        text = "E{0}:S{1}".format(episode, second)
        data.append(dict(html=html, text=text))

    Path("./prodigy-json/").mkdir(parents=True, exist_ok=True)
    with jsonlines.open(f'./prodigy-json/{episode}.jsonl', mode='w') as writer:
        writer.write_all(data)

In [3]:
# All episode mp3s in ./episodes/, ordered by path so runs are deterministic.
episodes = sorted(Path("./episodes/").glob("*.mp3"))

In [None]:
# Process each episode end to end: decode, downmix to mono, cut into clips,
# then write the matching Prodigy task file.
for episode in episodes:
    # Characters 6-8 of the file stem are used as the episode id
    # (presumably the files are named like "SyntaxNNN..." -- confirm against ./episodes/).
    ep_num = episode.stem[6:9]
    print(f"Loading {ep_num}")

    ep_mono = (
        pydub.AudioSegment
        .from_mp3(str(episode))
        .set_channels(1)
    )
    # The length is divided by 1000 to report seconds.
    print(f"Length (s): {len(ep_mono) / 1000}")

    split_audio(ep_mono, ep_num)
    print("Split Complete")

    create_jsonl(ep_num)
    print("JSONL Created")