## Use audio fragment for prototyping

In [None]:
from pydub import AudioSegment
import os

# Set the paths to your ffmpeg and ffprobe executables
# AudioSegment.ffmpeg = "/opt/homebrew/bin/ffmpeg"
# AudioSegment.ffprobe = "/opt/homebrew/bin/ffprobe"
os.environ["PATH"] += f"{os.pathsep}/opt/homebrew/bin"

# Load the input MP3 file
input_file = "./podcasts/the_home_run/How to prepare yourself to bid at an auction.mp3"
output_file = "./podcasts/dev.mp3"
audio = AudioSegment.from_mp3(input_file)

# Slice the first minute (60,000 milliseconds)
first_minute = audio[:60_000]

# Export the sliced audio as a new MP3 file
first_minute.export(output_file, format="mp3")

## Ingest

In [4]:
import pandas as pd
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import VectorDBQA
from langchain.document_loaders import TextLoader

from langchain.document_loaders import DataFrameLoader
import os

transcript_col = 'text'
embedding_function = OpenAIEmbeddings(openai_api_key=os.environ['OPEN_API_KEY'])
llm = OpenAI(openai_api_key=os.environ['OPEN_API_KEY'])
vectordb_persist_dir = 'db'
vectordb = Chroma(embedding_function=embedding_function, persist_directory=vectordb_persist_dir)
merge_threshold = 2

def merge_adjacent_utterances(df):
    # Merge records
    merged_records = []
    for _, row in df.iterrows():
        if row['merge']:
            # Merge text with the next recordb
            row['text'] += df.loc[_, 'text']
            row['end'] = df.loc[_, 'end']
            # Remove the next record from the dataframe
            df.drop(_, inplace=True)
        merged_records.append(row)
    return pd.DataFrame(merged_records).drop(columns=['delta', 'merge']).reset_index(drop=True)

def parse_transcript(transcript_file):
    dff = (pd.read_csv(transcript_file)
    # round, calculate deltas
    .assign(start=lambda x: round(x.start, 2))
    .assign(end=lambda x: round(x.end, 2))
    .assign(delta=lambda x: x.start.shift(-1) - x.end)
    .assign(merge=lambda x: x.delta > merge_threshold)
    )
    return merge_adjacent_utterances(dff)

def estimate_cost_of_ingest(transcript_df):
    ada_cost_per_1000_tokens = 0.0004
    n_tokens = len([e for e in ''.join(transcript_df.text.tolist()).split(' ') if e])
    print(f"{n_tokens} tokens found in transcript")
    cost_estimate = (n_tokens / 1000) * ada_cost_per_1000_tokens
    print(f"Estimate ingestion cost: US ${cost_estimate}")

def ingest_transcript_df(transcript_df):
    # TODO: add upsert, prevent redundant writes
    documents = DataFrameLoader(transcript_df.head(20), page_content_column=transcript_col).load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)
    # embed, write to chroma, save chroma db
    vectordb.add_documents(texts)
    vectordb.persist()

Using embedded DuckDB with persistence: data will be stored in: db


In [57]:
from transcribe import transcript_dir

for transcript in list(transcript_dir.rglob('*/*.csv')):
    transcript_df = parse_transcript(transcript).assign(podcast=transcript.parent.name)
    print(f"Ingesting transcript: {transcript.name}")
    estimate_cost_of_ingest(transcript_df)
    ingest_transcript_df(transcript_df)

Ingesting transcript: 517_tips_for_choosing_a_builder.csv
3226 tokens found in transcript
Estimate ingestion cost: US $0.0012904000000000001
3226 tokens found in transcript
Estimate ingestion cost: US $0.0012904000000000001
Ingesting transcript: australia's_richest_postcodes,_retreat_from_the_regions_&_new_housing_stimulus.csv
6265 tokens found in transcript
Estimate ingestion cost: US $0.002506
6265 tokens found in transcript
Estimate ingestion cost: US $0.002506
Ingesting transcript: 360._new_cpd_requirements_for_strata_professionals.csv
3483 tokens found in transcript
Estimate ingestion cost: US $0.0013932
3483 tokens found in transcript
Estimate ingestion cost: US $0.0013932
Ingesting transcript: the_art_of_persuasion__how_to_get_what_you_want_in_property_deals.csv
12225 tokens found in transcript
Estimate ingestion cost: US $0.00489
12225 tokens found in transcript
Estimate ingestion cost: US $0.00489
Ingesting transcript: buying_a_home_for_the_future__sustainability_and_long-term

In [7]:
qa = VectorDBQA.from_chain_type(llm=llm, chain_type="stuff", vectorstore=vectordb)
query = "Is this podcast useful for me if I want to buy my first unit, I am a first home buyer"
qa.run(query)

' Yes, this podcast is useful for you if you want to buy your first unit, as it provides tips and tools for first home buyers.'