# Transcript ETL Code

## Preamble

In [6]:

from pandera.typing import DataFrame

import pandas
import json
from pathlib import Path



## Config

In [7]:
# Root of the local project checkout; the data directories below are built from it.
# NOTE(review): absolute, user-specific path — this cell has to be edited before the
# notebook can run on another machine.
project_dir = Path("/Users/cls/Documents/Work/Projects/SoundOfAI/podcast-ai-lab/")
project_dir

PosixPath('/Users/cls/Documents/Work/Projects/SoundOfAI/podcast-ai-lab')

In [8]:
# Podcast data directory, located under the project root.
podcast_dir = project_dir / "data/podcasts/"
podcast_dir

PosixPath('/Users/cls/Documents/Work/Projects/SoundOfAI/podcast-ai-lab/data/podcasts')

In [20]:
# Transcript directory, located under the project root; the transcript files
# read later in this notebook are addressed relative to it.
transcript_dir = project_dir / "data/transcripts"
transcript_dir

PosixPath('/Users/cls/Documents/Work/Projects/SoundOfAI/podcast-ai-lab/data/transcripts')

In [13]:
ls {transcript_dir}

20220701 lex_ai_demis_hassabis.json  20220701 lex_ai_demis_hassabis.vtt


## Data Transforms

### Transcript to DataFrame

In [17]:
# Path to the sample transcript .json directly under transcript_dir.
# NOTE(review): not referenced anywhere in the visible part of the notebook; the
# example cells build their own path through a "Google" subdirectory instead —
# confirm which location is current.
example_transcript_path = transcript_dir / "20220701 lex_ai_demis_hassabis.json"

In [15]:
def google_transcript_to_dataframe(
    path,
) -> DataFrame:
    """Parse Google Speech to Text API result .json -> pandas.DataFrame"""
    transcript_data = pandas.read_json(path)
    return transcript_data

In [22]:
def wav2vec_transcript_to_dataframe(
    path,
) -> pandas.DataFrame:
    """Parse a Wav2Vec result .csv file into a pandas.DataFrame.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the CSV transcript; handed unchanged to
        ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pandas.read_csv`` with its default options.
        No columns are renamed, added or dropped here.
    """
    # Annotated with the plain pandas type: read_csv does not return a
    # schema-typed pandera frame, so the previous annotation was misleading.
    return pandas.read_csv(path)

## Examples

In [21]:
# Load the sample episode's Google transcript and display the resulting frame.
# NOTE(review): this reads from a "Google" subdirectory, while the directory
# listing earlier in the notebook shows the .json directly under transcript_dir —
# confirm the on-disk layout so the notebook survives Restart & Run All.
google_transcript_to_dataframe(
    transcript_dir / "Google" / "20220701 lex_ai_demis_hassabis.json"
)

Unnamed: 0,start,end,text,confidence,newpara
0,0.018,6.398,The following is a conversation with demouth c...,0.80,1.0
1,6.602,16.615,A company that has published and build some of...,0.88,
2,16.828,30.299,All by itself to play the game of Go better th...,0.90,
3,31.088,40.678,Thomas is widely considered to be one of the m...,0.80,1.0
4,40.873,50.563,This was truly an honor and a pleasure for me ...,0.88,
...,...,...,...,...,...
921,8172.342,8184.166,Human beings in this giant puzzle of ours and ...,0.81,
922,8184.954,8191.394,Thanks for listening to this conversation with...,0.81,1.0
923,8191.689,8195.580,And now let me leave you with some words from ...,0.91,1.0
924,8196.244,8202.197,Computer science is no more about computers an...,0.91,1.0


TODO:
- chunk enumeration
- file id / name

In [23]:
# Load the sample episode's Wav2Vec transcript and display the resulting frame.
# NOTE(review): the "wav2vec" subdirectory and the .csv file do not appear in the
# directory listing earlier in the notebook — confirm the on-disk layout.
wav2vec_transcript_to_dataframe(
    transcript_dir / "wav2vec" / "20220701 lex_ai_demis_hassabis.csv"
)

Unnamed: 0,audio_file,chunk_num,transcription,begin_secs,end_secs
0,podcasts/lex_ai_demis_hassabis.mp3,0,THE FOLLOWING IS A CONVERSATION WITH DEMIS ASA...,0.000000,9.990687
1,podcasts/lex_ai_demis_hassabis.mp3,1,BAU ARTIFICIAL INTELLIGENT SYSTEMS IN THE HIST...,9.990687,19.981375
2,podcasts/lex_ai_demis_hassabis.mp3,2,UMAN IN THE WORLD AND ALFOFOLD TOO THAT SOLVED...,19.981375,29.972062
3,podcasts/lex_ai_demis_hassabis.mp3,3,DAMIS IS WIDELY CONSIDERED TO BE ONE OFTHE MOS...,29.972062,39.962750
4,podcasts/lex_ai_demis_hassabis.mp3,4,E GENERAL THIS WAS TRULY AN HONOR AND A PLEASU...,39.962750,49.953437
...,...,...,...,...,...
818,podcasts/lex_ai_demis_hassabis.mp3,818,HUMAN BEINGS IN THIS GIANT PUZZLE OF OURS AND ...,8172.374875,8182.365500
819,podcasts/lex_ai_demis_hassabis.mp3,819,GLAD WE REALLY ENJOYED IT THANKSLEKS THANKS FO...,8182.365500,8192.356125
820,podcasts/lex_ai_demis_hassabis.mp3,820,LET ME LEAVE YOU ASOME WORDS FROM EDSCER TYXST...,8192.356125,8202.346750
821,podcasts/lex_ai_demis_hassabis.mp3,821,THANK YOU FOR LISTENING AND HPE TO SEE YOU NEX...,8202.346750,8212.337375


In [24]:
# Serialize the parsed Google transcript frame to a JSON string; the string is
# the cell's displayed value.
# NOTE(review): this writes the entire frame into the cell output as one long
# string — consider serializing only a small slice (e.g. .head()) for display.
google_transcript_to_dataframe(
    transcript_dir / "Google" / "20220701 lex_ai_demis_hassabis.json"
).to_json()

'{"start":{"0":0.018,"1":6.602,"2":16.828,"3":31.088,"4":40.873,"5":51.982,"6":64.018,"7":72.498,"8":79.736,"9":93.861,"10":99.991,"11":106.338,"12":124.954,"13":130.086,"14":137.297,"15":151.457,"16":159.424,"17":168.354,"18":181.489,"19":185.216,"20":194.497,"21":203.346,"22":207.478,"23":215.364,"24":223.448,"25":232.865,"26":241.273,"27":247.538,"28":254.137,"29":263.589,"30":274.257,"31":282.602,"32":297.879,"33":308.268,"34":322.554,"35":334.257,"36":345.015,"37":355.034,"38":364.28,"39":373.129,"40":381.438,"41":388.541,"42":393.978,"43":399.677,"44":411.911,"45":422.32,"46":436.244,"47":446.101,"48":454.645,"49":460.955,"50":468.301,"51":476.826,"52":484.892,"53":489.807,"54":499.152,"55":507.299,"56":513.15,"57":528.733,"58":538.645,"59":542.984,"60":552.49,"61":554.273,"62":564.31,"63":566.21,"64":577.615,"65":591.749,"66":599.068,"67":607.332,"68":616.136,"69":621.439,"70":629.117,"71":639.614,"72":642.891,"73":652.847,"74":662.561,"75":673.886,"76":680.889,"77":691.449,"78"