In [1]:
import json
import os

# download ffmpeg (https://www.ffmpeg.org/download.html) and set the path
ffmpeg_path = r"C:\Users\Sudheera\Documents\ffmpeg-n6.1-latest-win64-gpl-6.1\ffmpeg-n6.1-latest-win64-gpl-6.1\bin"
os.environ['PATH'] += os.pathsep + ffmpeg_path

with open('vars.json') as f:
    data = json.load(f)

## 1. Download subtitles from YouTube

In [2]:
import yt_dlp


# Function to download subtitles
def download_subtitles(url, lang='en'):
    ydl_opts = {
        'writesubtitles': True,  # Download subtitles
        'writeautomaticsub': True,  # Download automatic subtitles generated by YouTube
        'subtitleslangs': [lang],  # Subtitle language (default is English)
        'subtitlesformat': 'srt',  # Format for subtitles (e.g., srt, vtt)
        'skip_download': True,  # Do not download the video file itself
        'restrictfilenames': True,  # Do not allow special characters in file names
        'outtmpl': '%(title)s.%(ext)s',  # Output file naming
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

In [3]:
# URL of the YouTube video
video_url = 'https://www.youtube.com/watch?v=CDZ9REOh2xA'

# Download subtitle
download_subtitles(video_url)

[youtube] Extracting URL: https://www.youtube.com/watch?v=CDZ9REOh2xA
[youtube] CDZ9REOh2xA: Downloading webpage
[youtube] CDZ9REOh2xA: Downloading ios player API JSON
[youtube] CDZ9REOh2xA: Downloading web creator player API JSON
[youtube] CDZ9REOh2xA: Downloading m3u8 information
[info] CDZ9REOh2xA: Downloading subtitles: en




[info] CDZ9REOh2xA: Downloading 1 format(s): 616+251
[info] Writing video subtitles to: Elon_Musk_s_approach_to_problem-solving_Lex_Fridman_Podcast.en.vtt
[download] Destination: Elon_Musk_s_approach_to_problem-solving_Lex_Fridman_Podcast.en.vtt
[download] 100% of   68.77KiB in 00:00:00 at 76.26KiB/s


## 2. Download audio file from Youtube and transcribe it using Google Speech to Text

In [6]:
import yt_dlp


def download_audio(youtube_url, output_file):
    ydl_opts = {
        'format': 'bestaudio/best',  # Select the best available audio quality
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3'  # You can change this to 'wav', 'm4a', etc.
        }],
        'outtmpl': output_file,  # Output file name and path
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])

    return True

In [8]:
# Download Audio
download_audio(video_url, 'audio_new')

[youtube] Extracting URL: https://www.youtube.com/watch?v=CDZ9REOh2xA
[youtube] CDZ9REOh2xA: Downloading webpage
[youtube] CDZ9REOh2xA: Downloading ios player API JSON
[youtube] CDZ9REOh2xA: Downloading web creator player API JSON
[youtube] CDZ9REOh2xA: Downloading m3u8 information
[info] CDZ9REOh2xA: Downloading 1 format(s): 251
[download] Destination: audio_new
[download] 100% of    6.53MiB in 00:00:07 at 928.98KiB/s 
[ExtractAudio] Destination: audio_new.mp3
Deleting original file audio_new (pass -k to keep)


True

In [9]:
audio_file_path = r"audio_new.mp3"

import io
from pydub import AudioSegment

# Read the audio file into memory using io.BytesIO
with open(audio_file_path, 'rb') as audio_file:
    audio_data = io.BytesIO(audio_file.read())

    # Load the audio from the in-memory file
    audio = AudioSegment.from_file(audio_data, format="mp3")

    # Print the sampling rate
    print(f"Sampling Rate: {audio.frame_rate} Hz")

Sampling Rate: 48000 Hz


## 3. Transcribe the audio using Google Speech to Text

## 4. Pre-process downloaded transcript

In [1]:
# After transcribing an audio file you can download a csv file and a Json file from Google Speech-to-text
# csv :- transcript, confidence, speaker tag, language code, start time, end time (some words can be wrong)
# json :- Additional information you have here is the confidence of each word

# first we will read files separately and create one dataframe with necessary data 

import pandas as pd
import json

csv_file = 'audio.csv'
json_file = 'audio.json'

csv_column_names = {
    'Start time': 'start_time',
    'End time': 'end_time',
    'Language code': 'language_code',
    'Confidence': 'confidence',
    'Channel': 'channel',
    'Speaker tag': 'speaker_tag',
    'Transcript': 'transcript'
}

# Json file format :- 
# {
#    "results": [
#        { "alternatives": [
#               {
#                   "confidence": _
#                   "transcript": _
#                   "words": [
#                       {
#                           "confidence": _,
#                           "endTime": _,
#                           "startTime": _,
#                           "word": _
#                       }, ...
#           ],
#           "languageCode": _,
#           "resultEndTime": _
#       }, ...
#       ...
#       ...
#       {                                           <- last item of the list (after all transcriptions)
#           "alternatives": {
#               "words": [
#                   {
#                      "confidence": _,
#                      "endTime": _,
#                      "speakerLabel": _,
#                      "speakerTag": _,
#                      "startTime": _,
#                      "word": _
#                   },....                          <- one item for each word
#               ]
#           }
#       }
#   ]
# }

In [2]:
# Read the csv file
import hashlib


def get_sha256_hash(input_string):
    encoded_string = input_string.encode()
    sha256_hash = hashlib.sha256(encoded_string)
    return sha256_hash.hexdigest()


transcript_df = pd.read_csv(csv_file)
transcript_df.rename(columns=csv_column_names, inplace=True)
transcript_df.drop(columns=['channel'], inplace=True)
transcript_df['hashed_transcript'] = transcript_df['transcript'].apply(get_sha256_hash)
transcript_df

Unnamed: 0,start_time,end_time,language_code,confidence,speaker_tag,transcript,hashed_transcript
0,2.8,14.5,en-us,0.96046,1,Can you just speak to what it takes for great ...,a5350f3f9dfe18a3401a06f9972b31bab70158e3bed551...
1,15.4,18.1,en-us,0.84458,1,process of the process of constantly improvin...,7765622ac3bfe9d45ebd6d1418c06e5f34fe2726ad6190...
2,22.9,29.3,en-us,0.82285,2,"Well, these are to say simplify as vertical t...",f244266073005f037d09105a548175753ed8d105419285...
3,32.7,41.8,en-us,0.88606,2,You know how this very basic first basic firs...,6cdc50f14fc9676f34f8dcb7ce5fec772d6e77a844236e...
4,44.3,48.9,en-us,0.72278,2,The requirements always down to some degree. ...,ac34c4be5f95eb462a15976a7445f3c91c2dd8b24e586f...
...,...,...,...,...,...,...,...
86,489.4,497.2,en-us,0.83681,2,"look at the gray matter, which is the compute...",3c0ab2fae4413f984ce660e62cc68cd594907607e88449...
87,497.2,502.1,en-us,0.83681,1,like walking around in the super computer cen...,20078ff5635fc66c05669d581f7e12cdce2e95dbabe8de...
88,502.8,503.1,en-us,0.53844,1,"Yeah, well.",63ba1f77d332ea3b44f03b603d5061b69e9b8a599c1278...
89,504.1,505.5,en-us,0.90823,1,one day build a super intelligent,66f6b248588f96d27582131f394fde8035d9d5916b3571...


In [3]:
print(f"number of transcripts :- {transcript_df.shape[0]}")

number of transcripts :- 91


In [4]:
# Load data from JSON file
with open(json_file, 'r') as f:
    transcript_json = json.load(f)

print(len(transcript_json['results']))

76


In [5]:
transcript_json_df_cols = ['transcript', 'less_confident_words']

# less_confident_words = [{
#    "word": _,
#    "confidence": _
#    "index_in_transcript": _
#    }, ...]

transcript_json_df = pd.DataFrame(columns=transcript_json_df_cols)


def _get_less_confident_words_(words):
    less_confident_words = []
    for idx, word in enumerate(words):
        if word['confidence'] < 0.8:
            less_confident_words.append({
                'word': word['word'],
                'confidence': word['confidence'],
                'index_in_transcript': idx
            })
    return less_confident_words


for itm in transcript_json['results'][:-1]:
    transcript_json_row_data = {'transcript': [itm['alternatives'][0]['transcript']],
                                'less_confident_words': str(_get_less_confident_words_(itm['alternatives'][0]['words']))
                                }
    transcript_json_df = pd.concat([transcript_json_df, pd.DataFrame.from_dict(transcript_json_row_data)], ignore_index=True)

transcript_json_df['hashed_transcript'] = transcript_json_df['transcript'].apply(get_sha256_hash)
transcript_json_df

Unnamed: 0,transcript,less_confident_words,hashed_transcript
0,Can you just speak to what it takes for great ...,"[{'word': 'the', 'confidence': 0.76299578, 'in...",a5350f3f9dfe18a3401a06f9972b31bab70158e3bed551...
1,process of the process of constantly improvin...,"[{'word': 'of', 'confidence': 0.6193794, 'inde...",7765622ac3bfe9d45ebd6d1418c06e5f34fe2726ad6190...
2,"Well, these are to say simplify as vertical t...","[{'word': 'these', 'confidence': 0.46206599, '...",f244266073005f037d09105a548175753ed8d105419285...
3,You know how this very basic first basic firs...,"[{'word': 'how', 'confidence': 0.23147941, 'in...",6cdc50f14fc9676f34f8dcb7ce5fec772d6e77a844236e...
4,The requirements always down to some degree. ...,"[{'word': 'The', 'confidence': 0.39299181, 'in...",ac34c4be5f95eb462a15976a7445f3c91c2dd8b24e586f...
...,...,...,...
70,"It looks pretty cool. Yeah, it's like it's li...","[{'word': 'the', 'confidence': 0.76178557, 'in...",ee14698c0a54d64462b3858ada257156ecc0139bd16ea8...
71,brain tissue is the cables. Yeah. So look at ...,"[{'word': 'brain', 'confidence': 0.68274397, '...",156fcdf6552db47fc22afdeb8b752420a1c3407f53aa55...
72,"Yeah, well.","[{'word': 'Yeah,', 'confidence': 0.50714225, '...",63ba1f77d332ea3b44f03b603d5061b69e9b8a599c1278...
73,one day build a super intelligent,[],66f6b248588f96d27582131f394fde8035d9d5916b3571...


In [6]:
# merge two dataframes 
transcript_merged_df = pd.merge(transcript_df, transcript_json_df, on='hashed_transcript', how='left', suffixes=('_csv', '_json'))
transcript_merged_df.sort_values(by='start_time', inplace=True)
transcript_merged_df

Unnamed: 0,start_time,end_time,language_code,confidence,speaker_tag,transcript_csv,hashed_transcript,transcript_json,less_confident_words
0,2.8,14.5,en-us,0.96046,1,Can you just speak to what it takes for great ...,a5350f3f9dfe18a3401a06f9972b31bab70158e3bed551...,Can you just speak to what it takes for great ...,"[{'word': 'the', 'confidence': 0.76299578, 'in..."
1,15.4,18.1,en-us,0.84458,1,process of the process of constantly improvin...,7765622ac3bfe9d45ebd6d1418c06e5f34fe2726ad6190...,process of the process of constantly improvin...,"[{'word': 'of', 'confidence': 0.6193794, 'inde..."
2,22.9,29.3,en-us,0.82285,2,"Well, these are to say simplify as vertical t...",f244266073005f037d09105a548175753ed8d105419285...,"Well, these are to say simplify as vertical t...","[{'word': 'these', 'confidence': 0.46206599, '..."
3,32.7,41.8,en-us,0.88606,2,You know how this very basic first basic firs...,6cdc50f14fc9676f34f8dcb7ce5fec772d6e77a844236e...,You know how this very basic first basic firs...,"[{'word': 'how', 'confidence': 0.23147941, 'in..."
4,44.3,48.9,en-us,0.72278,2,The requirements always down to some degree. ...,ac34c4be5f95eb462a15976a7445f3c91c2dd8b24e586f...,The requirements always down to some degree. ...,"[{'word': 'The', 'confidence': 0.39299181, 'in..."
...,...,...,...,...,...,...,...,...,...
88,489.4,497.2,en-us,0.83681,2,"look at the gray matter, which is the compute...",3c0ab2fae4413f984ce660e62cc68cd594907607e88449...,,
89,497.2,502.1,en-us,0.83681,1,like walking around in the super computer cen...,20078ff5635fc66c05669d581f7e12cdce2e95dbabe8de...,,
90,502.8,503.1,en-us,0.53844,1,"Yeah, well.",63ba1f77d332ea3b44f03b603d5061b69e9b8a599c1278...,"Yeah, well.","[{'word': 'Yeah,', 'confidence': 0.50714225, '..."
91,504.1,505.5,en-us,0.90823,1,one day build a super intelligent,66f6b248588f96d27582131f394fde8035d9d5916b3571...,one day build a super intelligent,[]


## 5. Changing less confident words and combining the transcript into one conversation

In [7]:
podcast_name = "Lex Fridman Podcast"
podcast_topic = "Elon Musk's approach to problem-solving"
podcast_people = "Lex Fridman, Elon Musk"

transcript = "Can you just speak to what it takes for great engineering team for you? What I saw in Memphis the supercomputer cluster is just this intense drive towards simplifying the"
less_confident_words = "[{'word': 'the', 'index_in_transcript': 19}, {'word': 'supercomputer', 'index_in_transcript': 20}, {'word': 'is', 'index_in_transcript': 22}]"

In [8]:
with open('vars.json') as f:
  data = json.load(f)

openai_api_key = data["open_ai_api_key"]
langchain_api_key = data["langchain_api_key"]
# tavily_api_key = data["tavily_api_key"]
groq_api_key = data["groq_api_key"]

In [9]:
from langchain_agents._models import get_llm

llm_llama3 = get_llm(llm_type='llama3', llm_model='llama3-70b-8192', api_key=groq_api_key)
llm_gpt = get_llm(llm_type='gpt', llm_model='gpt-3.5-turbo', api_key=openai_api_key)

In [11]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

handle_less_confident_words_system_prompt = f"""\
You are an expert at casual, conversational english language. \
You have been provided with a transcript of a podcast between two people. \
The transcript has been generated using Google Speech-to-Text. \

Information about podcast: \
1. Name of the podcast: "{podcast_name}" \
2. Topic of the podcast: "{podcast_topic}" \
3. People in the podcast: "{podcast_people}" \

You have been provided with these: \
{{less_confident_words}} - less_confident_words (a list of json objects) \
{{transcript}} - transcript (a string) \

In less_confident_words json string, you have been provided less confident words with their index in the transcript. \
(index of the first word in the transcript is 0) \

These are the tasks you need to perform: \
1. You have to have a look at less_confident_words and figure out those words has to be changed or not. \
2. If you think the word has to be changed, you have to change it to a more suitable word. \
3. You have to review the transcript and make the conversation sound more natural. \
"""

handle_less_confident_words_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", handle_less_confident_words_system_prompt),
        ("human", "The less_confident_words is : {{less_confident_words}}. The transcript is : {{transcript}}."),
    ]
)

chain = handle_less_confident_words_prompt | llm_llama3 | StrOutputParser()

In [12]:
result = chain.invoke({
    "less_confident_words": less_confident_words,
    "transcript": transcript
})

In [14]:
print(result)

Based on the provided transcript and less_confident_words, I'll review and make the necessary changes to make the conversation sound more natural.

The transcript is: "Can you just speak to what it takes for great engineering team for you? What I saw in Memphis the supercomputer cluster is just this intense drive towards simplifying the -"

The less_confident_words are: [{'word': 'the', 'index_in_transcript': 19}, {'word': 'supercomputer', 'index_in_transcript': 20}, {'word': 'is', 'index_in_transcript': 22}]

After reviewing the transcript and less_confident_words, I suggest the following changes:

1. The word "the" at index 19 seems correct in the context, so no change is needed.
2. The word "supercomputer" at index 20 seems correct in the context, so no change is needed.
3. The word "is" at index 22 could be changed to "was" to make the sentence more grammatically correct and natural-sounding.

Here's the revised transcript:

"Can you just speak to what it takes for a great engineer