In [1]:
import json
import os

# download ffmpeg (https://www.ffmpeg.org/download.html) and set the path
ffmpeg_path = r"C:\Users\Sudheera\Documents\ffmpeg-n6.1-latest-win64-gpl-6.1\ffmpeg-n6.1-latest-win64-gpl-6.1\bin"
os.environ['PATH'] += os.pathsep + ffmpeg_path

with open('vars.json') as f:
    data = json.load(f)

## 1. Download subtitles from YouTube

In [2]:
import yt_dlp


# Function to download subtitles
def download_subtitles(url, lang='en'):
    ydl_opts = {
        'writesubtitles': True,  # Download subtitles
        'writeautomaticsub': True,  # Download automatic subtitles generated by YouTube
        'subtitleslangs': [lang],  # Subtitle language (default is English)
        'subtitlesformat': 'srt',  # Format for subtitles (e.g., srt, vtt)
        'skip_download': True,  # Do not download the video file itself
        'restrictfilenames': True,  # Do not allow special characters in file names
        'outtmpl': '%(title)s.%(ext)s',  # Output file naming
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

In [3]:
# URL of the YouTube video
video_url = 'https://www.youtube.com/watch?v=CDZ9REOh2xA'

# Download subtitle
download_subtitles(video_url)

[youtube] Extracting URL: https://www.youtube.com/watch?v=CDZ9REOh2xA
[youtube] CDZ9REOh2xA: Downloading webpage
[youtube] CDZ9REOh2xA: Downloading ios player API JSON
[youtube] CDZ9REOh2xA: Downloading web creator player API JSON
[youtube] CDZ9REOh2xA: Downloading m3u8 information
[info] CDZ9REOh2xA: Downloading subtitles: en




[info] CDZ9REOh2xA: Downloading 1 format(s): 616+251
[info] Writing video subtitles to: Elon_Musk_s_approach_to_problem-solving_Lex_Fridman_Podcast.en.vtt
[download] Destination: Elon_Musk_s_approach_to_problem-solving_Lex_Fridman_Podcast.en.vtt
[download] 100% of   68.77KiB in 00:00:00 at 76.26KiB/s


## 2. Download audio file from Youtube and transcribe it using Google Speech to Text

In [6]:
import yt_dlp


def download_audio(youtube_url, output_file):
    ydl_opts = {
        'format': 'bestaudio/best',  # Select the best available audio quality
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3'  # You can change this to 'wav', 'm4a', etc.
        }],
        'outtmpl': output_file,  # Output file name and path
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])

    return True

In [8]:
# Download Audio
download_audio(video_url, 'audio_new')

[youtube] Extracting URL: https://www.youtube.com/watch?v=CDZ9REOh2xA
[youtube] CDZ9REOh2xA: Downloading webpage
[youtube] CDZ9REOh2xA: Downloading ios player API JSON
[youtube] CDZ9REOh2xA: Downloading web creator player API JSON
[youtube] CDZ9REOh2xA: Downloading m3u8 information
[info] CDZ9REOh2xA: Downloading 1 format(s): 251
[download] Destination: audio_new
[download] 100% of    6.53MiB in 00:00:07 at 928.98KiB/s 
[ExtractAudio] Destination: audio_new.mp3
Deleting original file audio_new (pass -k to keep)


True

In [9]:
audio_file_path = r"audio_new.mp3"

import io
from pydub import AudioSegment

# Read the audio file into memory using io.BytesIO
with open(audio_file_path, 'rb') as audio_file:
    audio_data = io.BytesIO(audio_file.read())

    # Load the audio from the in-memory file
    audio = AudioSegment.from_file(audio_data, format="mp3")

    # Print the sampling rate
    print(f"Sampling Rate: {audio.frame_rate} Hz")

Sampling Rate: 48000 Hz


## 3. Transcribe the audio using Google Speech to Text

## 4. Pre-process downloaded transcript

In [1]:
# After transcribing an audio file you can download a csv file and a Json file from Google Speech-to-text
# csv :- transcript, confidence, speaker tag, language code, start time, end time (some words can be wrong)
# json :- Additional information you have here is the confidence of each word

# first we will read files separately and create one dataframe with necessary data 

import pandas as pd
import json

csv_file = 'audio.csv'
json_file = 'audio.json'

csv_column_names = {
    'Start time': 'start_time',
    'End time': 'end_time',
    'Language code': 'language_code',
    'Confidence': 'confidence',
    'Channel': 'channel',
    'Speaker tag': 'speaker_tag',
    'Transcript': 'transcript'
}

# Json file format :- 
# {
#    "results": [
#        { "alternatives": [
#               {
#                   "confidence": _
#                   "transcript": _
#                   "words": [
#                       {
#                           "confidence": _,
#                           "endTime": _,
#                           "startTime": _,
#                           "word": _
#                       }, ...
#           ],
#           "languageCode": _,
#           "resultEndTime": _
#       }, ...
#       ...
#       ...
#       {                                           <- last item of the list (after all transcriptions)
#           "alternatives": {
#               "words": [
#                   {
#                      "confidence": _,
#                      "endTime": _,
#                      "speakerLabel": _,
#                      "speakerTag": _,
#                      "startTime": _,
#                      "word": _
#                   },....                          <- one item for each word
#               ]
#           }
#       }
#   ]
# }

In [2]:
# Read the csv file
import hashlib


def get_sha256_hash(input_string):
    encoded_string = input_string.encode()
    sha256_hash = hashlib.sha256(encoded_string)
    return sha256_hash.hexdigest()


transcript_df = pd.read_csv(csv_file)
transcript_df.rename(columns=csv_column_names, inplace=True)
transcript_df.drop(columns=['channel'], inplace=True)
transcript_df['hashed_transcript'] = transcript_df['transcript'].apply(get_sha256_hash)
transcript_df

Unnamed: 0,start_time,end_time,language_code,confidence,speaker_tag,transcript,hashed_transcript
0,2.8,14.5,en-us,0.96046,1,Can you just speak to what it takes for great ...,a5350f3f9dfe18a3401a06f9972b31bab70158e3bed551...
1,15.4,18.1,en-us,0.84458,1,process of the process of constantly improvin...,7765622ac3bfe9d45ebd6d1418c06e5f34fe2726ad6190...
2,22.9,29.3,en-us,0.82285,2,"Well, these are to say simplify as vertical t...",f244266073005f037d09105a548175753ed8d105419285...
3,32.7,41.8,en-us,0.88606,2,You know how this very basic first basic firs...,6cdc50f14fc9676f34f8dcb7ce5fec772d6e77a844236e...
4,44.3,48.9,en-us,0.72278,2,The requirements always down to some degree. ...,ac34c4be5f95eb462a15976a7445f3c91c2dd8b24e586f...
...,...,...,...,...,...,...,...
86,489.4,497.2,en-us,0.83681,2,"look at the gray matter, which is the compute...",3c0ab2fae4413f984ce660e62cc68cd594907607e88449...
87,497.2,502.1,en-us,0.83681,1,like walking around in the super computer cen...,20078ff5635fc66c05669d581f7e12cdce2e95dbabe8de...
88,502.8,503.1,en-us,0.53844,1,"Yeah, well.",63ba1f77d332ea3b44f03b603d5061b69e9b8a599c1278...
89,504.1,505.5,en-us,0.90823,1,one day build a super intelligent,66f6b248588f96d27582131f394fde8035d9d5916b3571...


In [3]:
print(f"number of transcripts :- {transcript_df.shape[0]}")

number of transcripts :- 91


In [4]:
# Load data from JSON file
with open(json_file, 'r') as f:
    transcript_json = json.load(f)

print(len(transcript_json['results']))

76


In [5]:
transcript_json_df_cols = ['transcript', 'less_confident_words']

# less_confident_words = [{
#    "word": _,
#    "confidence": _
#    "index_in_transcript": _
#    }, ...]

transcript_json_df = pd.DataFrame(columns=transcript_json_df_cols)


def _get_less_confident_words_(words):
    less_confident_words = []
    for idx, word in enumerate(words):
        if word['confidence'] < 0.8:
            less_confident_words.append({
                'word': word['word'],
                'index_in_transcript': idx
            })
    return less_confident_words


for itm in transcript_json['results'][:-1]:
    transcript_json_row_data = {'transcript': [itm['alternatives'][0]['transcript']],
                                'less_confident_words': str(_get_less_confident_words_(itm['alternatives'][0]['words']))
                                }
    transcript_json_df = pd.concat([transcript_json_df, pd.DataFrame.from_dict(transcript_json_row_data)],
                                   ignore_index=True)

transcript_json_df['hashed_transcript'] = transcript_json_df['transcript'].apply(get_sha256_hash)
transcript_json_df

Unnamed: 0,transcript,less_confident_words,hashed_transcript
0,Can you just speak to what it takes for great ...,"[{'word': 'the', 'index_in_transcript': 19}, {...",a5350f3f9dfe18a3401a06f9972b31bab70158e3bed551...
1,process of the process of constantly improvin...,"[{'word': 'of', 'index_in_transcript': 1}, {'w...",7765622ac3bfe9d45ebd6d1418c06e5f34fe2726ad6190...
2,"Well, these are to say simplify as vertical t...","[{'word': 'these', 'index_in_transcript': 1}, ...",f244266073005f037d09105a548175753ed8d105419285...
3,You know how this very basic first basic firs...,"[{'word': 'how', 'index_in_transcript': 2}, {'...",6cdc50f14fc9676f34f8dcb7ce5fec772d6e77a844236e...
4,The requirements always down to some degree. ...,"[{'word': 'The', 'index_in_transcript': 0}, {'...",ac34c4be5f95eb462a15976a7445f3c91c2dd8b24e586f...
...,...,...,...
70,"It looks pretty cool. Yeah, it's like it's li...","[{'word': 'the', 'index_in_transcript': 9}, {'...",ee14698c0a54d64462b3858ada257156ecc0139bd16ea8...
71,brain tissue is the cables. Yeah. So look at ...,"[{'word': 'brain', 'index_in_transcript': 0}, ...",156fcdf6552db47fc22afdeb8b752420a1c3407f53aa55...
72,"Yeah, well.","[{'word': 'Yeah,', 'index_in_transcript': 0}, ...",63ba1f77d332ea3b44f03b603d5061b69e9b8a599c1278...
73,one day build a super intelligent,[],66f6b248588f96d27582131f394fde8035d9d5916b3571...


In [6]:
# merge two dataframes 
transcript_merged_df = pd.merge(transcript_df, transcript_json_df, on='hashed_transcript', how='left',
                                suffixes=('_csv', '_json'))
transcript_merged_df.sort_values(by='start_time', inplace=True)
transcript_merged_df

Unnamed: 0,start_time,end_time,language_code,confidence,speaker_tag,transcript_csv,hashed_transcript,transcript_json,less_confident_words
0,2.8,14.5,en-us,0.96046,1,Can you just speak to what it takes for great ...,a5350f3f9dfe18a3401a06f9972b31bab70158e3bed551...,Can you just speak to what it takes for great ...,"[{'word': 'the', 'index_in_transcript': 19}, {..."
1,15.4,18.1,en-us,0.84458,1,process of the process of constantly improvin...,7765622ac3bfe9d45ebd6d1418c06e5f34fe2726ad6190...,process of the process of constantly improvin...,"[{'word': 'of', 'index_in_transcript': 1}, {'w..."
2,22.9,29.3,en-us,0.82285,2,"Well, these are to say simplify as vertical t...",f244266073005f037d09105a548175753ed8d105419285...,"Well, these are to say simplify as vertical t...","[{'word': 'these', 'index_in_transcript': 1}, ..."
3,32.7,41.8,en-us,0.88606,2,You know how this very basic first basic firs...,6cdc50f14fc9676f34f8dcb7ce5fec772d6e77a844236e...,You know how this very basic first basic firs...,"[{'word': 'how', 'index_in_transcript': 2}, {'..."
4,44.3,48.9,en-us,0.72278,2,The requirements always down to some degree. ...,ac34c4be5f95eb462a15976a7445f3c91c2dd8b24e586f...,The requirements always down to some degree. ...,"[{'word': 'The', 'index_in_transcript': 0}, {'..."
...,...,...,...,...,...,...,...,...,...
88,489.4,497.2,en-us,0.83681,2,"look at the gray matter, which is the compute...",3c0ab2fae4413f984ce660e62cc68cd594907607e88449...,,
89,497.2,502.1,en-us,0.83681,1,like walking around in the super computer cen...,20078ff5635fc66c05669d581f7e12cdce2e95dbabe8de...,,
90,502.8,503.1,en-us,0.53844,1,"Yeah, well.",63ba1f77d332ea3b44f03b603d5061b69e9b8a599c1278...,"Yeah, well.","[{'word': 'Yeah,', 'index_in_transcript': 0}, ..."
91,504.1,505.5,en-us,0.90823,1,one day build a super intelligent,66f6b248588f96d27582131f394fde8035d9d5916b3571...,one day build a super intelligent,[]


## 5. Changing less confident words and combining the transcript into one conversation

In [7]:
# Combining the transcript into one conversation
less_confident_words_list = []
transcript_list = []

row_start_index = 0
transcript_itm = ""
less_confident_words_itm = []
prev_speaker_tag = None

for idx, row in transcript_merged_df.iterrows():
    if prev_speaker_tag is None or prev_speaker_tag == row['speaker_tag']:
        transcript_itm += row['transcript_csv'].strip() + " "

        if not pd.isnull(row['less_confident_words']):
            row_less_confident_words_list = eval(row['less_confident_words'])
            for word in row_less_confident_words_list:
                less_confident_words_itm.append({
                    'word': word['word'],
                    'index_in_transcript': row_start_index + word['index_in_transcript']
                })
    else:
        transcript_list.append({
            'speaker_tag': prev_speaker_tag,
            'transcript': transcript_itm.strip()
        })
        transcript_itm = row['transcript_csv'] + " "

        less_confident_words_list.append(less_confident_words_itm)
        less_confident_words_itm = []
        row_start_index = 0
        if not pd.isnull(row['less_confident_words']):
            row_less_confident_words_list = eval(row['less_confident_words'])
            for word in row_less_confident_words_list:
                less_confident_words_itm.append({
                    'word': word['word'],
                    'index_in_transcript': row_start_index + word['index_in_transcript']
                })

    # row_start_index = 1 if row_start_index == 0 else row_start_index  # this is needed when we compare the index of words in the transcript
    row_start_index += (len(row['transcript_csv'].split(" ")) - 1)
    prev_speaker_tag = row['speaker_tag']

transcript_list.append({
    'speaker_tag': prev_speaker_tag,
    'transcript': transcript_itm.strip()
})
less_confident_words_list.append(less_confident_words_itm)

print(f"len(less_confident_words_list) :- {len(less_confident_words_list)}")
less_confident_words_list

len(less_confident_words_list) :- 29


[[{'word': 'the', 'index_in_transcript': 19},
  {'word': 'supercomputer', 'index_in_transcript': 20},
  {'word': 'is', 'index_in_transcript': 22},
  {'word': 'of', 'index_in_transcript': 30},
  {'word': 'the', 'index_in_transcript': 31},
  {'word': 'of', 'index_in_transcript': 33}],
 [{'word': 'these', 'index_in_transcript': 1},
  {'word': 'are', 'index_in_transcript': 2},
  {'word': 'as', 'index_in_transcript': 6},
  {'word': 'how', 'index_in_transcript': 13},
  {'word': 'first', 'index_in_transcript': 17},
  {'word': 'first', 'index_in_transcript': 19},
  {'word': 'the', 'index_in_transcript': 36},
  {'word': 'The', 'index_in_transcript': 41},
  {'word': 'down', 'index_in_transcript': 44},
  {'word': 'to', 'index_in_transcript': 45},
  {'word': 'some', 'index_in_transcript': 46},
  {'word': 'if', 'index_in_transcript': 49},
  {'word': 'want', 'index_in_transcript': 51},
  {'word': 'to', 'index_in_transcript': 52},
  {'word': 'Sorrow', 'index_in_transcript': 53},
  {'word': 'by', 'ind

In [8]:
print(f"len(transcript_list) :- {len(transcript_list)}")
transcript_list

len(transcript_list) :- 29


[{'speaker_tag': 1,
  'transcript': 'Can you just speak to what it takes for great engineering team for you? What I saw in Memphis the supercomputer cluster is just this intense drive towards simplifying the process of the process of constantly improving it constantly iterating it.'},
 {'speaker_tag': 2,
  'transcript': "Well, these are to say simplify as vertical to do it. You know how this very basic first basic first principles algorithm that I run kind of as like a mantra which is the first question the requirements make the requirements. The requirements always down to some degree. So if you want to Sorrow by reducing the number of requirements. And nobody how smart person is gave you those requirements this little dumb just under three. If you have to start there because otherwise you could get the perfect answer to the wrong question. Though so try to make the question the least wrong possible. That's what question that requirements means and then the second thing is try to dele

In [9]:
podcast_name = "Lex Fridman Podcast"
podcast_topic = "Elon Musk's approach to problem-solving"
podcast_people = "Lex Fridman, Elon Musk"

# transcript = "And nobody how smart person is gave you those requirements this little dumb just under three."
# less_confident_words = "[{'word': 'And', 'index_in_transcript': 0}, {'word': 'nobody', 'index_in_transcript': 1}, {'word': 'how', 'index_in_transcript': 2}, {'word': 'smart', 'index_in_transcript': 3}, {'word': 'person', 'index_in_transcript': 4}, {'word': 'is', 'index_in_transcript': 5}, {'word': 'gave', 'index_in_transcript': 6}, {'word': 'you', 'index_in_transcript': 7}, {'word': 'this', 'index_in_transcript': 10}, {'word': 'little', 'index_in_transcript': 11}, {'word': 'dumb', 'index_in_transcript': 12}, {'word': 'just', 'index_in_transcript': 13}, {'word': 'under', 'index_in_transcript': 14}, {'word': 'three.', 'index_in_transcript': 15}]"

In [10]:
with open('vars.json') as f:
    data = json.load(f)

openai_api_key = data["open_ai_api_key"]
langchain_api_key = data["langchain_api_key"]
# tavily_api_key = data["tavily_api_key"]
groq_api_key = data["groq_api_key"]

In [11]:
from langchain_agents._models import get_llm

llm_llama3 = get_llm(llm_type='llama3', llm_model='llama3-70b-8192', api_key=groq_api_key)
llm_gpt = get_llm(llm_type='gpt', llm_model='gpt-4o-mini', api_key=openai_api_key)

In [12]:
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List
from langchain_core.tools import tool


class SingleTranscript(BaseModel):
    """
    The piece of transcript with the speaker identification
    """
    speaker_id: int = Field(description="Speaker identification number")
    transcript_str: str = Field(description="The piece of transcript the speaker said.")   


class HandleLessConfidentWordsOutput(BaseModel):
    """
    The output transcript.
    """
    transcript: List[SingleTranscript] = Field(
        description="The list of piece of changed transcripts with the speaker identification."
    )
    considerations: List[str] = Field([], description="A list things you considered when changing the transcript.")
    actions: List[str] = Field([], description="A list of actions you took to change the transcript.")

In [13]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

handle_less_confident_words_system_prompt = f"""\
You are an expert at casual, conversational english language. \
You have been provided with a transcript of a podcast between two people. \
The transcript has been generated using Google Speech-to-Text. \

Information about podcast: \
1. Name of the podcast: "{podcast_name}" \
2. Topic of the podcast: "{podcast_topic}" \
3. People in the podcast: "{podcast_people}" \

You have been provided with these: \
{{less_confident_words}} - less_confident_words_list (a list of lists of json objects). \
                           Each json object has two keys: word (the less confident word) and index_in_transcript (the index of the word in the transcript. index of the first word in the transcript is 0). \
                           Some internal lists can be empty. \
                           That means Google Speech-to-Text has not provided less confident words for the corresponding transcript. \
{{transcript}} - transcript_list (a list of json objects) \
                 Each json object has two keys: speaker_tag (the speaker identification number) and transcript (the piece of dialog the speaker said. This can be half of sentences and phrases.) \

i th item in less_confident_words_list corresponds to i th item in transcript_list. \

Follow these steps to make the original transcript better and provide the changed transcript : \
1. Go through the transcript and less_confident_words_list. \
2. You have to figure out words in the less_confident_words_list has to be changed or not to fit into the context. \
3. Identify the less confident words in the transcript which are not given in the less_confident_words_list, if there are any and change them to fit into the context. \
4. If you think the word has to be changed, you have to change it to a more suitable word. \
5. You have to make the transcript sound more natural. \
6. You have to make sure that the transcript is coherent and the words are in the right context. \
7. You have to make sure that each transcript makes sense. \
8. Provide the changed transcript after following the above steps, a list things you considered when changing the transcript and a list of actions you took to change the transcript. \
"""

handle_less_confident_words_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", handle_less_confident_words_system_prompt),
        ("human", "The less_confident_words is : {{less_confident_words}}. The transcript is : {{transcript}}."),
    ]
)

chain = handle_less_confident_words_prompt | llm_gpt.with_structured_output(schema=HandleLessConfidentWordsOutput)
# chain = handle_less_confident_words_prompt | llm_llama3.with_structured_output(schema=HandleLessConfidentWordsOutput)
# chain = handle_less_confident_words_prompt | llm_llama3 | StrOutputParser()

In [14]:
result = chain.invoke({
    "less_confident_words": less_confident_words_list,
    "transcript": transcript_list
})

In [15]:
for idx, transcript in enumerate(result.transcript):
    print(transcript.__dict__)

# print(result)

{'speaker_id': 1, 'transcript_str': 'Can you just speak to what it takes for a great engineering team for you? What I saw in Memphis, the supercomputer cluster is just this intense drive towards simplifying the process and constantly improving it, constantly iterating it.'}
{'speaker_id': 2, 'transcript_str': "Well, these are to say simplify as vertical to do it. You know how this very basic first principles algorithm that I run kind of as like a mantra, which is the first question: the requirements. Make the requirements. The requirements always come down to some degree. So if you want to reduce sorrow by reducing the number of requirements. And nobody, no matter how smart a person is, gave you those requirements, this little dumb just under three. If you have to start there because otherwise you could get the perfect answer to the wrong question. So try to make the question the least wrong possible. That's what the question that requirements means. And then the second thing is try to

In [16]:
print(result.considerations)

['Ensured coherence and context of the dialogue', 'Improved natural flow of conversation', 'Corrected any awkward phrasing or unclear references', 'Maintained the technical accuracy of the discussion']


In [17]:
print(result.actions)

["Changed 'Sorrow' to 'reduce sorrow' for clarity", 'Rephrased sentences for better flow and understanding', "Corrected 'caveling' to 'cabling' for accuracy", 'Adjusted punctuation for better readability', 'Removed redundant phrases to streamline the dialogue']
