# Transcript ETL Code

## Preamble

In [1]:

from pandera.typing import DataFrame

import pandas
import json
from pathlib import Path



## Config

In [2]:
project_dir = Path("/Users/cls/Documents/Work/Projects/SoundOfAI/podcast-ai-lab/")
project_dir

PosixPath('/Users/cls/Documents/Work/Projects/SoundOfAI/podcast-ai-lab')

In [3]:
podcast_dir = project_dir / "data/podcasts/"
podcast_dir

PosixPath('/Users/cls/Documents/Work/Projects/SoundOfAI/podcast-ai-lab/data/podcasts')

In [4]:
transcript_dir = project_dir / "data/transcripts/Google"
transcript_dir

PosixPath('/Users/cls/Documents/Work/Projects/SoundOfAI/podcast-ai-lab/data/transcripts/Google')

In [5]:
ls {transcript_dir}

20220617 lex_ai_richard_wolff.json   20220628 lex_ai_susan_cain.vtt
20220617 lex_ai_richard_wolff.vtt    20220701 lex_ai_demis_hassabis.json
20220628 lex_ai_susan_cain.json      20220701 lex_ai_demis_hassabis.vtt


## Data Transforms

### Transcript to DataFrame

In [6]:
example_transcript_path = transcript_dir / "20220701 lex_ai_demis_hassabis.json"

In [7]:
def google_transcript_to_dataframe(
    path,
) -> DataFrame:
    """Parse Google Speech to Text API result .json -> pandas.DataFrame"""
    transcript_data = pandas.read_json(path)
    return transcript_data

In [8]:
def wav2vec_transcript_to_dataframe(
    path,
) -> DataFrame:
    """Parse Wav2Vec result .csv -> pandas.DataFrame"""
    transcript_data = pandas.read_csv(path)
    return transcript_data

In [9]:
def parse_google_transcript(
    path,
    episode_id: int = None,
) -> DataFrame:
    """Parse Google Speech to Text API result .json -> pandas.DataFrame"""
    transcript_data = pandas.read_json(path)
    #transcript_data["file_name"] = path.stem
    transcript_data["episode_id"] = episode_id
    return transcript_data

In [10]:
def clean_google_transcript(
    transcript_data,
) -> DataFrame:
    """Clean Google Speech to Text API result."""
    transcript_data = (
        transcript_data
        .rename(
            columns={
                "start": "chunk_start",
                "end": "chunk_end",
                "newpara": "new_paragraph"
            }
        )
    )
    transcript_data["new_paragraph"] = transcript_data["new_paragraph"].fillna(0.0).astype(bool)
    transcript_data["chunk_number"] = transcript_data.index
    # enumerate paragraphs
    transcript_data["paragraph_number"] = transcript_data["new_paragraph"].cumsum()
    transcript_data = transcript_data.drop(columns=["new_paragraph"])
    return transcript_data


In [11]:
def process_transcripts(
    transcript_dir,
):
    """Process all transcripts in a directory."""
    transcript_paths = [
        p for p in transcript_dir.glob("*.json")
    ]
    transcript_data = [
        clean_google_transcript(parse_google_transcript(path, episode_id)) for episode_id, path in enumerate(transcript_paths)
    ]
    episode_data = pandas.DataFrame(
        [
            {"episode_id": episode_id, "file_name": path.stem}
            for episode_id, path in enumerate(transcript_paths)
        ]
    )
    transcript_data = pandas.concat(transcript_data)
    transcript_data = transcript_data.set_index(["episode_id", "chunk_number"])
    episode_data = episode_data.set_index("episode_id")
    return transcript_data, episode_data

## Examples

TODO:
- chunk enumeration
- file id / name

In [12]:
transcript_data = clean_google_transcript(
    parse_google_transcript(
        transcript_dir / "20220701 lex_ai_demis_hassabis.json"
    )
)
transcript_data

Unnamed: 0,chunk_start,chunk_end,text,confidence,episode_id,chunk_number,paragraph_number
0,0.018,6.398,The following is a conversation with demouth c...,0.80,,0,1
1,6.602,16.615,A company that has published and build some of...,0.88,,1,1
2,16.828,30.299,All by itself to play the game of Go better th...,0.90,,2,1
3,31.088,40.678,Thomas is widely considered to be one of the m...,0.80,,3,2
4,40.873,50.563,This was truly an honor and a pleasure for me ...,0.88,,4,2
...,...,...,...,...,...,...,...
921,8172.342,8184.166,Human beings in this giant puzzle of ours and ...,0.81,,921,203
922,8184.954,8191.394,Thanks for listening to this conversation with...,0.81,,922,204
923,8191.689,8195.580,And now let me leave you with some words from ...,0.91,,923,205
924,8196.244,8202.197,Computer science is no more about computers an...,0.91,,924,206


In [13]:
transcript_data, episode_data = process_transcripts(
    transcript_dir=transcript_dir,
)

In [14]:
episode_data

Unnamed: 0_level_0,file_name
episode_id,Unnamed: 1_level_1
0,20220701 lex_ai_demis_hassabis
1,20220617 lex_ai_richard_wolff
2,20220628 lex_ai_susan_cain


In [15]:
transcript_data

Unnamed: 0_level_0,Unnamed: 1_level_0,chunk_start,chunk_end,text,confidence,paragraph_number
episode_id,chunk_number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0.018,6.398,The following is a conversation with demouth c...,0.80,1
0,1,6.602,16.615,A company that has published and build some of...,0.88,1
0,2,16.828,30.299,All by itself to play the game of Go better th...,0.90,1
0,3,31.088,40.678,Thomas is widely considered to be one of the m...,0.80,2
0,4,40.873,50.563,This was truly an honor and a pleasure for me ...,0.88,2
...,...,...,...,...,...,...
2,864,7495.120,7503.117,If you'll accept really strong emotions someti...,0.90,178
2,865,7503.708,7511.174,Highly sensitive people also process informati...,0.91,178
2,866,7511.783,7521.274,They tend to notice so these that others miss ...,0.81,178
2,867,7522.478,7525.730,Thank you for listening and hope to see you ne...,0.85,179


In [21]:
def load_knowledge_base(
    knowledge_base_dir,
):
    """Load knowledge base."""
    knowledge_base_dir = Path(knowledge_base_dir)
    transcript_data = pandas.read_parquet(knowledge_base_dir / "transcript_data.parquet")
    episode_data = pandas.read_parquet(knowledge_base_dir / "episode_data.parquet")

    return {
        "transcript_data": transcript_data, 
        "episode_data": episode_data,
    }

In [22]:
from engine import config

ModuleNotFoundError: No module named 'engine'

In [26]:
knowledge_base = load_knowledge_base(
    knowledge_base_dir="../../data/knowledge_base/"
)

In [28]:
knowledge_base["transcript_data"]

Unnamed: 0_level_0,Unnamed: 1_level_0,chunk_start,chunk_end,text,confidence,paragraph_number
episode_id,chunk_number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0.018,6.398,The following is a conversation with demouth c...,0.80,1
0,1,6.602,16.615,A company that has published and build some of...,0.88,1
0,2,16.828,30.299,All by itself to play the game of Go better th...,0.90,1
0,3,31.088,40.678,Thomas is widely considered to be one of the m...,0.80,2
0,4,40.873,50.563,This was truly an honor and a pleasure for me ...,0.88,2
...,...,...,...,...,...,...
3,1192,10258.852,10267.146,it was a pleasure\nthanks for listening to thi...,0.80,244
3,1193,10267.620,10271.611,And now let me leave you with some words from ...,0.91,244
3,1194,10272.518,10279.659,Your assumptions are your windows on the world...,0.88,245
3,1195,10280.683,10283.674,Thank you for listening and hope to see you ne...,0.87,246


In [29]:
knowledge_base["episode_data"]

Unnamed: 0_level_0,file_name
episode_id,Unnamed: 1_level_1
0,20220701 lex_ai_demis_hassabis
1,20220617 lex_ai_richard_wolff
2,20220628 lex_ai_susan_cain
3,20220122 lex_ai_yann_lecun_2
