<a href="https://colab.research.google.com/github/FaizanCod/agentic-ai/blob/master/Faizan_Speech2Text_v1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task
Create a time-coded JSON output of topics extracted from a video, given a video URL.

In [3]:
!pip install -q yt-dlp
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q torch # Ensure PyTorch is installed for Whisper
!pip install -q torchaudio
!pip install -q ffmpeg-python

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.0/180.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone


In [5]:
import json
import os
import re
import shutil

import whisper
import yt_dlp
from google.colab import drive

In [6]:
# --- Pipeline configuration -------------------------------------------------
# Source video to transcribe.
YOUTUBE_URL = "https://www.youtube.com/watch?v=gC88zq9Y4gQ"  # REPLACE with your video URL

# Working directory on the Colab VM; every artefact below is written here.
OUTPUT_DIR = "/content/transcription_output/"

# File name of the grouped transcript (reused for the Google Drive copy).
TRANSCRIPT_FILE_NAME = "final_transcript_grouped.txt"

# Absolute paths derived from the settings above.
TRANSCRIPT_FILE_PATH = os.path.join(OUTPUT_DIR, TRANSCRIPT_FILE_NAME)
AUDIO_FILE = os.path.join(OUTPUT_DIR, "audio.mp3")


In [7]:
# Create the output directory if it doesn't exist.
# exist_ok=True makes this idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Target URL: {YOUTUBE_URL}")
print(f"Output directory: {OUTPUT_DIR}")

Target URL: https://www.youtube.com/watch?v=gC88zq9Y4gQ
Output directory: /content/transcription_output/


In [8]:
# Download the audio track of YOUTUBE_URL with yt-dlp, convert it to MP3 and
# move it to the fixed AUDIO_FILE path that the transcription step reads.
print("\n--- Starting Audio Extraction (The Robust Ingestion Tool) ---")

try:
    # 1. Snapshot the directory so the file yt-dlp creates can be identified
    #    afterwards without having to predict its (title-derived) name.
    files_before = set(os.listdir(OUTPUT_DIR))

    # 2. Define yt-dlp options for audio extraction and conversion
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': os.path.join(OUTPUT_DIR, '%(title)s_yt_dlp_temp'),  # Simple, temporary base name
        # A watch URL may carry a `list=` parameter; only the single video is
        # wanted, otherwise several .mp3 files would be produced.
        'noplaylist': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',  # Extract audio
            'preferredcodec': 'mp3',      # Convert to MP3 format
            'preferredquality': '192',    # Audio quality
        }],
        'logger': None,
        'quiet': True,
        'no_warnings': True,
    }

    # 3. Download and convert the audio file
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.extract_info(YOUTUBE_URL, download=True)

    # 4. Find the newly downloaded file by comparing file lists.
    #    Sorting makes the choice deterministic should more than one appear.
    files_after = set(os.listdir(OUTPUT_DIR))
    new_files = sorted(files_after - files_before)
    downloaded_file_name = next((f for f in new_files if f.endswith('.mp3')), None)

    if not downloaded_file_name:
        # If yt-dlp ran but didn't produce an mp3 file, this handles the error.
        raise FileNotFoundError("yt-dlp ran, but no new .mp3 file was found in the output directory. Check video content or restrictions.")

    downloaded_file_path = os.path.join(OUTPUT_DIR, downloaded_file_name)

    # 5. Move the file to the standardized AUDIO_FILE path for Whisper.
    #    os.replace overwrites an AUDIO_FILE left over from a previous run.
    os.replace(downloaded_file_path, AUDIO_FILE)

    print(f"Audio successfully downloaded and standardized to: {AUDIO_FILE}")

except yt_dlp.utils.DownloadError as e:
    print("\n--- ERROR: YT-DLP Download Failed ---")
    if "Private video" in str(e):
        print("The video is likely private or restricted.")
    elif "Video unavailable" in str(e):
        print("The video is unavailable, deleted, or has strict geo-restrictions.")
    else:
        print(f"A download error occurred: {e}")
    # Halt execution if audio retrieval failed
    raise
except FileNotFoundError as e:
    print(f"\n--- FILE SYSTEM ERROR ---")
    print(f"The downloaded file was not found during the rename operation: {e}")
    print("This often means the download process failed silently or the file extension was unexpected.")
    raise
except Exception as e:
    print(f"\n--- GENERIC EXTRACTION ERROR ---")
    print(f"An unexpected error occurred during audio extraction: {e}")
    raise


--- Starting Audio Extraction (The Robust Ingestion Tool) ---
Audio successfully downloaded and standardized to: /content/transcription_output/audio.mp3


In [9]:
def group_segments_by_context(segments, max_gap_sec=2.0, max_group_sec=None):
    """
    Merge adjacent transcript segments into larger, contextually complete blocks.

    A segment is appended to the current group when the silence between it and
    the previous segment is shorter than ``max_gap_sec``. Otherwise it starts a
    new group.

    Args:
        segments: Sequence of dicts with keys ``"start_sec"``, ``"end_sec"``
            and ``"text"``, in chronological order.
        max_gap_sec: Gap (seconds) between the previous segment's end and the
            next segment's start at or above which a new group is started.
        max_group_sec: Optional upper bound (seconds) on the span of one group,
            measured from the group's start to the end of the segment being
            added. Continuous speech has no long pauses and would otherwise
            collapse into a single block with a single timestamp. ``None``
            (the default) disables the bound.

    Returns:
        A list of dicts ``{"start_sec": ..., "text": ...}``, one per group,
        where ``text`` is the member texts joined by single spaces. An empty
        input yields an empty list.
    """
    if not segments:
        return []

    grouped_output = []
    group_start = segments[0]["start_sec"]
    # Collect pieces and join once instead of repeated string concatenation.
    group_texts = [segments[0]["text"]]

    for previous, current in zip(segments, segments[1:]):
        gap = current["start_sec"] - previous["end_sec"]
        exceeds_span = (
            max_group_sec is not None
            and current["end_sec"] - group_start > max_group_sec
        )

        if gap < max_gap_sec and not exceeds_span:
            # Small pause and still within the allowed span: same context.
            group_texts.append(current["text"])
        else:
            # Long pause (or group too long): close the group, start a new one.
            grouped_output.append({"start_sec": group_start, "text": " ".join(group_texts)})
            group_start = current["start_sec"]
            group_texts = [current["text"]]

    # Append the last remaining group
    grouped_output.append({"start_sec": group_start, "text": " ".join(group_texts)})
    return grouped_output

In [10]:
# Transcribe AUDIO_FILE with Whisper, merge the segments into context groups
# and write a "HH:MM:SS text" transcript to TRANSCRIPT_FILE_PATH.
print("\n--- Starting Transcription (The Analysis Tool) ---")


def format_time(seconds):
    """Render a duration in seconds as zero-padded HH:MM:SS (fraction dropped)."""
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"


try:
    # 1. Load the Whisper model.
    # 'base' is fast and good for quick tests. Use 'medium' or 'large' for production.
    model = whisper.load_model("base")
    print("Whisper model loaded successfully.")

    # 2. Transcribe the audio, ensuring the 'word_timestamps' option is true
    result = model.transcribe(
        AUDIO_FILE,
        verbose=True,
        word_timestamps=True
    )

    # 3. Reduce each Whisper segment to the fields the grouping step needs
    raw_segments = [
        {
            "start_sec": segment['start'],
            "end_sec": segment['end'],
            "text": segment['text'].strip(),
        }
        for segment in result['segments']
    ]

    context_grouped_segments = group_segments_by_context(raw_segments, max_gap_sec=2.0)

    # =================================================================
    # STEP 5: OUTPUT THE FINAL TIME-CODED TRANSCRIPT
    # =================================================================

    print("\n--- Final Contextually Grouped Transcript for AI Agent ---")

    # One "<HH:MM:SS> <text>" line per group: the format the Agent reads
    transcript_lines = [
        f"{format_time(segment['start_sec'])} {segment['text']}"
        for segment in context_grouped_segments
    ]
    for line in transcript_lines:
        print(line)
    final_transcript = "".join(line + "\n" for line in transcript_lines)

    # Save to the path declared in the configuration cell so that later steps
    # (e.g. the Drive upload) look for the file in exactly the same place.
    transcript_file = TRANSCRIPT_FILE_PATH
    with open(transcript_file, "w", encoding="utf-8") as f:
        f.write(final_transcript)

    print(f"\nSuccessfully transcribed and grouped {len(context_grouped_segments)} segments.")
    print(f"Transcript saved to: {transcript_file}")

except Exception as e:
    print(f"An error occurred during transcription: {e}")
    print("Please check the Colab logs for detailed error messages.")
    # Re-raise so "Run all" stops here instead of continuing with a missing
    # transcript file (mirrors the error handling of the extraction step).
    raise


--- Starting Transcription (The Analysis Tool) ---


100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 216MiB/s]


Whisper model loaded successfully.




Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:01.980 --> 00:07.120]  Piazztry is 15, has to get one in as well, is on a better lap, but so is everyone out there.
[00:07.240 --> 00:09.320]  So no, there's having a real tough time of it though.
[00:09.680 --> 00:13.160]  Science finds himself in third, lock up for Bermann, won't make the corner,
[00:13.400 --> 00:17.820]  stroll goes fast as Piazztry gets his lap in when he needed it to the barrier.
[00:18.400 --> 00:22.660]  For Oliver Bermann, scape-ting deep into the run off, the yellow flag is out,
[00:22.800 --> 00:26.900]  for Stappen goes to second place, Antonelli is now in the drop zone.
[00:27.460 --> 00:32.680]  30 seconds to go, Alex Albin slowing the car down into the corner that we were talking about,
[00:32.860 --> 00:35.820]  where the yellow flag is out, you can see it there. Now it's been withdrawn.
[00:36.240 --> 00:40.400]  Colour Pinto goe

In [11]:
# Copy the transcript produced earlier into the user's Google Drive.
print("\n--- Starting Google Drive Upload ---")

# 1. Mount Google Drive
drive_mount_path = "/content/gdrive"
drive.mount(drive_mount_path)

# 2. Define the destination path in your Drive
# This will create a folder named 'AI_Transcripts' in the root of your Google Drive
drive_destination_folder = os.path.join(drive_mount_path, 'MyDrive', 'AI_Transcripts')

if not os.path.exists(drive_destination_folder):
    os.makedirs(drive_destination_folder, exist_ok=True)
    print(f"Created destination folder: {drive_destination_folder}")

# 3. Copy the file.
# shutil.copy raises if the source is missing or the write fails, so the
# success banner below is only printed when the copy really happened
# (a failing `!cp` shell call would have been silently ignored).
drive_destination_path = os.path.join(drive_destination_folder, TRANSCRIPT_FILE_NAME)
shutil.copy(TRANSCRIPT_FILE_PATH, drive_destination_path)

print("\n-------------------------------------------")
print("✅ Upload Successful!")
print(f"The file has been saved to your Google Drive at:")
print(f"MyDrive/AI_Transcripts/{TRANSCRIPT_FILE_NAME}")
print("-------------------------------------------")


--- Starting Google Drive Upload ---
Mounted at /content/gdrive
Created destination folder: /content/gdrive/MyDrive/AI_Transcripts

-------------------------------------------
✅ Upload Successful!
The file has been saved to your Google Drive at:
MyDrive/AI_Transcripts/final_transcript_grouped.txt
-------------------------------------------
