In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.core.interactiveshell import InteractiveShell
from mutagen.mp3 import MP3
from tqdm import tqdm

import re
import ftfy


def remove_excess_char(
    input_string: str,
) -> str:
    """Collapse runs of a repeated whitespace character into a single one.

    A run of two or more identical newlines, tabs, carriage returns, vertical
    tabs or spaces is replaced by one occurrence of that same character.
    Runs that mix different whitespace characters (e.g. ``" \\n "``) are left
    untouched.

    Args:
        input_string: Text to normalise.

    Returns:
        The text with every same-character whitespace run collapsed.
    """
    # The back-reference (\1) only matches repeats of the SAME character, so
    # each kind of whitespace is collapsed independently. A single regex pass
    # handles runs of any length, unlike a fixed cascade of str.replace calls.
    return re.sub(r"([\n\t\r\v ])\1+", r"\1", input_string)


def remove_html_tags(string: str) -> str:
    """Return the text content of ``string`` with the HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    extracted text fragments are joined with a single space.
    """
    parsed = BeautifulSoup(string, "html.parser")
    text_only = parsed.get_text(separator=" ")
    return text_only

def remove_malformed_utf8(string: str) -> str:
    """Pass ``string`` through ``ftfy.fix_text`` and return the result."""
    repaired = ftfy.fix_text(string)
    return repaired

In [114]:
import logging
# Named logger used by the scraping helpers; its threshold is set to INFO.
pod_scrape_logger = logging.getLogger(name="pod_scrape_logger")
pod_scrape_logger.setLevel(level=logging.INFO)

In [None]:
# consolidate trove podcasts
# NOTE(review): both CSV locations are machine-specific absolute paths.
trove_a = pd.read_csv(
    "/home/samhardyhey/experiment_artefacts/stt_training/misc_data/radio_national_trove_results.csv"
)

trove_b = pd.read_csv(
    "/home/samhardyhey/experiment_artefacts/stt_training/misc_data/radio_national_trove_results_v2.csv"
)

trove_all = (
    pd.concat([trove_a, trove_b])
    .drop_duplicates("id")
    # Keep only links to programme pages. The match is a literal substring
    # (regex=False); rows with a missing link count as "no match" (na=False)
    # instead of producing a NaN mask that boolean indexing rejects.
    .pipe(
        lambda x: x[
            x.abc_site_link.str.contains("/programs/", regex=False, na=False)
        ]
    )
)

# consolidate all URLS
# all_podcast_urls = list(set(most_recent + trove_all.abc_site_link.tolist()))


## Audio/transcript retrieval
- Retrieve 100 most recent podcasts from RN page
- Use additional podcasts retrieved via Trove
- Consolidate/filter web addresses, retrieve audio and transcripts

In [None]:
def get_abc_podcast_transcript_and_mp3(podcast_url, audio_dir, transcript_dir):
    """Download the audio and transcript of one ABC podcast page, best effort.

    The output file name is derived from the part of ``podcast_url`` that
    follows ``"programs/"``. Nothing is fetched when either output file
    already exists. Audio and transcript are retrieved independently: failing
    to get one does not stop the other from being saved, and neither failure
    is raised to the caller.

    Args:
        podcast_url: Page URL; must contain ``"programs/"``.
        audio_dir: ``pathlib.Path`` directory that receives ``<name>.mp3``.
        transcript_dir: ``pathlib.Path`` directory that receives ``<name>.txt``.

    Raises:
        IndexError: If ``podcast_url`` does not contain ``"programs/"``.
        requests.RequestException: If the page itself cannot be fetched
            (connection problem or HTTP error status).
    """
    file_name = (
        podcast_url.split("programs/")[1]
        .replace("/", "_")
        .replace(" ", "-")
        .replace(",", "")
    )
    audio_path = audio_dir / f"{file_name}.mp3"
    # The transcript lives in transcript_dir itself, not in a path built by
    # joining audio_dir and transcript_dir.
    transcript_path = transcript_dir / f"{file_name}.txt"

    # Skip pages for which at least one of the two files was already saved.
    if audio_path.exists() or transcript_path.exists():
        return

    page = requests.get(podcast_url)
    # An error status propagates to the caller, which reports the page as
    # not downloadable.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # attempt to retrieve audio
    try:
        player = soup.find(id="comp-audio-player5")
        mp3_url = player.find_all("a")[0]["href"]
        doc = requests.get(mp3_url)
        # Do not save an HTTP error body under an ".mp3" name.
        doc.raise_for_status()
        with open(audio_path, "wb") as f:
            f.write(doc.content)
    except Exception:
        # Deliberately silent: many pages carry no audio player. `Exception`
        # (not a bare except) keeps KeyboardInterrupt/SystemExit working.
        pass

    # attempt to retrieve transcript
    try:
        transcript_body = soup.find(
            "div", class_="view-transcript comp-accordion-wrapper"
        ).find("div", class_="comp-rich-text clearfix")
        transcript = transcript_body.get_text(separator="\n")
        with open(transcript_path, "w") as f:
            f.write(transcript)
    except Exception:
        # Deliberately silent: many pages carry no transcript.
        pass


In [None]:
from urllib.parse import urlparse, urlsplit
from pathlib import Path

import shutil

output_dir = Path("../output/abc_podcasts")
# Start from an empty output directory on every run.
if output_dir.exists():
    shutil.rmtree(str(output_dir))
output_dir.mkdir(parents=True)

# get most recent podcasts from RN website
# (the scheme needs a double slash; "https:/host" has no host component)
url = "https://www.abc.net.au/radionational/transcripts/"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")

# Keep links that point below /radionational/programs and whose path has
# more than three components.
transcript_page_urls = []
for a in soup.find_all("a", href=True):
    if "/radionational/programs" in a["href"] and len(Path(a["href"]).parts) > 3:
        transcript_page_urls.append(f"https://www.abc.net.au{a['href']}")


In [119]:
# Fetch and parse the first transcript page found above.
r = requests.get(url=transcript_page_urls[0])
soup = BeautifulSoup(markup=r.content, features="html.parser")

def get_podcast_mp3_link(page_soup):
    """Return the audio URL found on a parsed podcast page.

    Candidate URLs are the ``src`` attribute of every ``<audio>`` element and
    of every ``<source>`` element nested inside one, in document order.

    Args:
        page_soup: Parsed page exposing BeautifulSoup's ``find_all``.

    Returns:
        The first candidate URL, or ``None`` when the page has no candidate.
        A warning is logged when there is no candidate or more than one.
    """
    # Same logger object as the notebook-level `pod_scrape_logger`.
    logger = logging.getLogger("pod_scrape_logger")

    mp3_candidate_links = []
    for audio in page_soup.find_all("audio"):
        if audio.get("src"):
            mp3_candidate_links.append(audio["src"])
        mp3_candidate_links.extend(
            source["src"] for source in audio.find_all("source", src=True)
        )

    if not mp3_candidate_links:
        logger.warning("No candidate mp3 URL found")
        return None

    if len(mp3_candidate_links) > 1:
        # Ambiguous page: report it, but still hand back the first candidate.
        logger.warning("More than 1 candidate mp3 URL found")

    return mp3_candidate_links[0]

def download_podcast_mp3(mp3_url, audio_dir, file_name):
    """Download ``mp3_url`` and save it as ``<file_name>.mp3`` in ``audio_dir``.

    Args:
        mp3_url: URL of the audio file.
        audio_dir: ``pathlib.Path`` of the destination directory.
        file_name: Output file name without extension.

    Raises:
        requests.HTTPError: If the server answers with an error status; no
            file is written in that case.
    """
    doc = requests.get(mp3_url)
    # Without this check an HTML error page would be written out as ".mp3".
    doc.raise_for_status()
    with open(audio_dir / f"{file_name}.mp3", "wb") as f:
        f.write(doc.content)

def get_podcast_transcript(page_soup):
    """Return the text of the element with ``id="transcript"``.

    Args:
        page_soup: Parsed page exposing BeautifulSoup's ``find``.

    Returns:
        The element's text with fragments separated by newlines, or ``None``
        when the page has no such element.
    """
    transcript_node = page_soup.find(id="transcript")
    if transcript_node is None:
        # Page without a transcript section.
        return None
    return transcript_node.get_text(separator="\n")

# Try the helpers on the page parsed above: locate the audio, grab the
# transcript text, then save the audio under the name of the URL's parent
# path segment.
mp3_link = get_podcast_mp3_link(page_soup=soup)
transcript_rough = get_podcast_transcript(page_soup=soup)

download_podcast_mp3(
    mp3_url=mp3_link,
    audio_dir=output_dir,
    file_name=Path(transcript_page_urls[0]).parent.name,
)


In [None]:
# Display the parsed page for manual inspection.
soup

In [None]:
# Output locations for the downloaded audio and the raw transcripts.
audio_dir = Path(
    "/home/samhardyhey/experiment_artefacts/stt_training/exp_a/sh_abc_podcasts/audio"
)
transcript_dir = Path(
    "/home/samhardyhey/experiment_artefacts/stt_training/exp_a/sh_abc_podcasts/transcripts"
)

# Create each directory (and its parents) only when it is not there yet.
if not audio_dir.exists():
    audio_dir.mkdir(parents=True, exist_ok=True)
if not transcript_dir.exists():
    transcript_dir.mkdir(parents=True, exist_ok=True)


In [None]:
%%time
# NOTE(review): `all_podcast_urls` is only assigned in a commented-out line
# earlier in the notebook; it has to exist before this cell runs.
for podcast_url in tqdm(all_podcast_urls):
    # `except Exception` rather than a bare `except`, so that interrupting the
    # kernel (KeyboardInterrupt) stops the loop instead of being reported as
    # one more failed download.
    try:
        get_abc_podcast_transcript_and_mp3(
            podcast_url, audio_dir, transcript_dir)
    except Exception:
        print(f"Could not download page contents for: {podcast_url}")

## Post-processing
- Remove audio/transcripts which are missing either an audio/transcript file (incomplete)
- Clean transcripts, remove speaker tags
- Remove podcasts with slow/fast WPM/speaker rates
- Remove very short/long podcasts

In [None]:
# get intersection: stems that have both an audio file and a transcript
all_audio = {e.stem for e in audio_dir.rglob("./*.mp3")}
all_transcript = {e.stem for e in transcript_dir.rglob("./*.txt")}
complete_transcripts = all_audio & all_transcript

# remove files not in intersection
# Path.unlink keeps this cell independent of `os` (imported only much further
# down); each listing is materialised before anything is deleted.
for e in list(audio_dir.rglob("./*.mp3")) + list(transcript_dir.rglob("./*.txt")):
    if e.stem not in complete_transcripts:
        e.unlink()


In [None]:
# Destination for the cleaned transcripts.
transcript_processed_dir = Path(
    "/home/samhardyhey/experiment_artefacts/stt_training/exp_a/sh_abc_podcasts/transcripts_processed"
)

# Create the directory (and its parents) only when it is not there yet.
if not transcript_processed_dir.exists():
    transcript_processed_dir.mkdir(parents=True, exist_ok=True)


In [None]:
def remove_speaker_tags(text):
    """Strip speaker tags and bracketed annotations from a transcript.

    The transcript is processed line by line:

    * a colon within the first 20 characters is treated as the end of a
      speaker tag and everything up to that first colon is dropped;
    * ``[...]`` and ``(...)`` spans are removed;
    * repeated whitespace is collapsed via ``remove_excess_char``;
    * empty lines and lines ending in ``":"`` are discarded.

    Args:
        text: Raw transcript text.

    Returns:
        The surviving lines joined by single spaces.
    """
    filtered = []
    # A colon that starts a line is moved to the end of the previous line
    # before splitting into lines.
    for line in text.replace("\n:", ":\n").split("\n"):
        # weirdo newline characters
        line = line.strip()

        # Speaker tag, probably: cut at the FIRST colon only, so later colons
        # (times, ratios, ...) and the text after them are kept.
        if ":" in line[:20]:
            line = line.split(":", 1)[1].strip()

        # remove production audio overlay brackets/parens
        if "[" in line:
            line = re.sub(r"\[(.*?)\]", "", line)

        if "(" in line:
            line = re.sub(r"\(.*?\)", "", line)

        # Collapse excess characters, then trim what the bracket removal left
        # at the edges so the final join does not produce double spaces.
        line = remove_excess_char(line).strip()

        if len(line) == 0:
            # zero length
            continue

        if line.endswith(":"):
            # probably a speaker utterance mark
            continue

        filtered.append(line)

    return " ".join(filtered)  # concat


In [None]:
# transcript post-processing debugging
# import random

# from srsly import read_jsonl

# train_manifest = (pd.DataFrame(
#     list(read_jsonl('/home/samhardyhey/experiment_artefacts/stt_training/exp_b/data/output/train_manifest.json')))
#                  .assign(stem=lambda x: x.audio_filepath.apply(lambda y: Path(y).stem))
#                   .pipe(lambda x: x[x.text_no_preprocessing.str.contains('Craig')])
#                  )

# with open('/home/samhardyhey/experiment_artefacts/stt_training/exp_b/data/output/manifests/', 'r') as f:
#     text = f.read()

# for e in random.sample(list(transcript_dir.rglob('./*.txt')), 10):
# #     if 'breakfast_economists-warning-to-the-rba-dont-raise-interest_2961146' in e.stem:
#     with open(e, 'r') as f:
#         text = f.read()
#         print()
#         print()
#         print(text[:400])
#         print()
#         removed_tags = remove_speaker_tags(text)
#         print(removed_tags[:400])


In [None]:
for e in transcript_dir.rglob("./*.txt"):
    # read the raw transcript and strip the speaker tags
    text = e.read_text()
    removed_tags = remove_speaker_tags(text)

    # save under the same file name in the processed directory
    (transcript_processed_dir / f"{e.stem}.txt").write_text(removed_tags)


In [None]:
# Collect one record per podcast (text, audio duration, file locations) as
# input for the length / WPM filtering below.
transcript_records = []
for audio, transcript in zip(
    sorted(audio_dir.rglob("./*.mp3")),
    sorted(transcript_processed_dir.rglob("./*.txt")),
):
    # Both listings are sorted, so each pair must share the same stem.
    assert audio.stem == transcript.stem

    transcript_records.append(
        dict(
            transcript=transcript.read_text(),
            audio_len=MP3(audio).info.length,
            audio_path=audio,
            transcript_path=transcript,
        )
    )


In [None]:
# Average WPM/speaking rates - https://virtualspeech.com/blog/average-speaking-rate-words-per-minute
# Presentations: between 100 - 150 wpm for a comfortable pace
# Conversational: between 120 - 150 wpm
# Audiobooks: between 150 - 160 wpm, which is the upper range that people comfortably hear and vocalise words
# Radio hosts and podcasters: between 150 - 160 wpm
# Auctioneers: can speak at about 250 wpm
# Commentators: between 250- 400 wpm


In [None]:
# Derive per-podcast length and speaking-rate columns, then keep podcasts of
# 5-30 minutes spoken at 120-180 words per minute.
filtered_podcast_meta = (
    pd.DataFrame(transcript_records)
    .assign(stem=lambda x: x.audio_path.map(lambda y: y.stem))
    # word count = number of single-space separated tokens
    .assign(transcript_len=lambda x: x.transcript.map(lambda y: len(y.split(" "))))
    # enforce podcast min/max length
    .assign(audio_len_min=lambda x: x.audio_len / 60)
    .query("audio_len_min >= 5 & audio_len_min <= 30")
    # enforce average floor/ceiling on WPM rates
    # (plain column arithmetic instead of a row-wise apply: same values, and
    # it also yields a proper empty column when no row survives the filter)
    .assign(wpm=lambda x: x.transcript_len / (x.audio_len / 60))
    .query("wpm >= 120 & wpm <= 180")
)


In [None]:
# Load the processed transcript of the row at position 999 for inspection.
text = filtered_podcast_meta.iloc[999].transcript_path.read_text()


In [None]:
# Summary statistics of the length and speaking-rate columns.
filtered_podcast_meta[["audio_len", "transcript_len", "audio_len_min", "wpm"]].describe()


In [None]:
# File counts: audio, raw transcripts, processed transcripts.
sum(1 for _ in audio_dir.rglob("./*.mp3"))
sum(1 for _ in transcript_dir.rglob("./*.txt"))
sum(1 for _ in transcript_processed_dir.rglob("./*.txt"))


In [None]:
# remove files not within final query
# The set of surviving stems is built once instead of on every iteration, and
# Path.unlink keeps this cell independent of `os` (imported only much further
# down). Each listing is materialised before anything is deleted.
kept_stems = set(filtered_podcast_meta.stem)

for directory, pattern in (
    (audio_dir, "./*.mp3"),
    (transcript_dir, "./*.txt"),
    (transcript_processed_dir, "./*.txt"),
):
    for e in list(directory.rglob(pattern)):
        if e.stem not in kept_stems:
            e.unlink()


In [None]:
# sanity check: the remaining audio files and processed transcripts must
# cover exactly the same stems
{e.stem for e in audio_dir.rglob("./*.mp3")} == {
    e.stem for e in transcript_processed_dir.rglob("./*.txt")
}


## Prior meta analysis

In [None]:
# Split a total number of seconds into hours / minutes / seconds.
# divmod keeps whole hours and whole minutes; plain division would leave the
# minutes and seconds at (about) zero.
# NOTE(review): `duration` must already hold a number of seconds when this
# cell runs - it is not assigned above this point in the notebook.
hrs, remainder = divmod(duration, 3600)
mins, seconds = divmod(remainder, 60)

f"{int(hrs)}:{int(mins)}:{seconds}"


In [None]:
# `glob` is not imported anywhere else in the notebook.
from glob import glob

# NOTE(review): without `recursive=True` each "**" behaves like a single "*",
# so this matches mp3 files exactly two directory levels below abc_podcasts.
mp3s = glob(
    "/home/rianne/-data/experiment_artefacts/stt_training/exp_b/abc_podcasts/**/**/*.mp3"
)


In [None]:
# Length in seconds of every mp3, in the order of `mp3s`.
# (`MP3(...).info.length` is already the number; it has no `.info` of its own.)
duration = [MP3(mp3).info.length for mp3 in mp3s]


In [None]:
def cal_transcript_len(mp3_loc):
    """Count the whitespace-separated words in the transcript of an mp3.

    The transcript location is derived from ``mp3_loc`` by replacing
    ``"/audio/"`` with ``"/text/"`` and ``".mp3"`` with ``".txt"``.

    Args:
        mp3_loc: Path of the mp3 file, as a string.

    Returns:
        Number of words in the matching transcript file.
    """
    transcript_loc = mp3_loc.replace("/audio/", "/text/").replace(".mp3", ".txt")

    with open(transcript_loc) as handle:
        return len(handle.read().split())


In [None]:
# One row per mp3 path.
df = pd.DataFrame(data=mp3s, columns=["mp3"])


In [None]:
# Word count of the transcript that belongs to each mp3.
df["n_words"] = df["mp3"].map(cal_transcript_len)


In [None]:
# Audio length in seconds per mp3. The path column is named "mp3"; the frame
# has no column 0.
df["audio_len"] = df["mp3"].apply(lambda x: MP3(x).info.length)


In [None]:
# Keep podcasts under 2000 s and under 7000 words, then drop two groups of
# outliers: short audio with many words, and long audio with few words.
small_df = (
    df.query("audio_len < 2000")
    .query("n_words < 7000")
    .loc[lambda frame: ~((frame.audio_len < 400) & (frame.n_words > 2200))]
    .loc[lambda frame: ~((frame.audio_len > 1500) & (frame.n_words < 500))]
)

small_df.plot.scatter(x="n_words", y="audio_len", c="DarkBlue")


In [None]:
# Total audio length of the kept podcasts, in seconds.
duration = small_df.audio_len.sum()

# divmod keeps whole hours and whole minutes; plain division would leave the
# minutes and seconds at (about) zero.
hrs, remainder = divmod(duration, 3600)
mins, seconds = divmod(remainder, 60)

f"{int(hrs)}:{int(mins)}:{seconds}"


# Remove outliers & long podcasts

In [None]:
# File stems before and after the outlier filtering. The path column is named
# "mp3"; neither frame has a column 0.
all_files = {Path(f).stem for f in df["mp3"].tolist()}
keep_files = {Path(f).stem for f in small_df["mp3"].tolist()}


In [None]:
# Stems that were filtered out and therefore have to be deleted.
remove_files = all_files - keep_files


In [None]:
import os

incorrect_files = list(remove_files)

# Each of the three files is removed independently, so one missing file no
# longer prevents the other two from being deleted. Only OSError (missing
# file, permissions, ...) is treated as "could not remove".
for file in incorrect_files:
    failed = False
    for path in (
        f"/home/rianne/-data/experiment_artefacts/stt_training/exp_a/abc_podcasts/transformed/text_with_speaker_tags/{file}.txt",
        f"/home/rianne/-data/experiment_artefacts/stt_training/exp_a/abc_podcasts/transformed/text_no_speaker_tags/{file}.txt",
        f"/home/rianne/-data/experiment_artefacts/stt_training/exp_a/abc_podcasts/transformed/audio/{file}.mp3",
    ):
        try:
            os.remove(path)
        except OSError:
            failed = True
    if failed:
        print(f"Could not remove file {file}")
