# Pre-training video dataset creation from YouTube videos



In [None]:
%cd /content/drive/MyDrive/pre_training_phase_data

In [None]:
!ls

In [None]:
import os

# Report how many entries each pipeline directory currently holds.
# Each directory is listed right before its count is printed, so a missing
# directory fails at the same point as before.
videos_list = os.listdir("./videos")
print(f"Number of Videos {len(videos_list)}")

audio_entries = os.listdir('./audio')
print(f"Number of Audios {len(audio_entries)}")

transcript_entries = os.listdir('./transcripts')
print(f"Number of transcripts {len(transcript_entries)}")

refined_entries = os.listdir('./refined_transcripts/')
print(f"Number of refined transcripts: {len(refined_entries)}")

# Download YouTube Videos


In [None]:
!pip install -q yt-dlp pandas

In [None]:
import yt_dlp
import os
import pandas as pd
from datetime import datetime
import shutil
import subprocess
import time  # Import the time module


def is_ffmpeg_installed():
    """Report whether an ``ffmpeg`` executable can be found on the system PATH.

    Returns:
        True when ``shutil.which`` resolves ``ffmpeg``, False otherwise.
    """
    ffmpeg_location = shutil.which("ffmpeg")
    if ffmpeg_location is None:
        return False
    return True


def extract_audio_ffmpeg(video_filepath: str, audio_dir: str) -> str | None:
    """
    Extracts audio from a video file using FFmpeg, converting it to 16kHz mono WAV.

    Args:
        video_filepath: The full path to the input video file.
        audio_dir: The directory where the extracted audio will be saved.

    Returns:
        The filename of the extracted audio file if successful, otherwise None.
    """
    if not os.path.exists(video_filepath):
        print(f"‚ùå Error: Video file not found at {video_filepath}")
        return None

    try:
        video_basename = os.path.basename(video_filepath)
        video_name_no_ext = os.path.splitext(video_basename)[0]
        audio_filename = f"{video_name_no_ext}.wav"
        output_audio_path = os.path.join(audio_dir, audio_filename)

        print(f"üéµ Extracting audio from '{video_basename}'...")

        # Command to extract audio, convert to PCM 16-bit little-endian,
        # set sample rate to 16kHz, mono channel, and overwrite output
        command = [
            'ffmpeg', '-i', video_filepath, '-vn', '-acodec', 'pcm_s16le',
            '-ar', '16000', '-ac', '1', '-y', output_audio_path
        ]

        # Run ffmpeg, suppressing stdout and stderr to keep the log clean
        subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        print(f"‚úÖ Audio extracted: {output_audio_path}")
        return audio_filename

    except subprocess.CalledProcessError:
        print(f"‚ùå FFmpeg error during audio extraction for {video_filepath}.")
        return None
    except Exception as e:
        print(f"‚ùå Unexpected error during audio extraction: {e}")
        return None


def download_video_and_extract_audio(video_url: str,
                                     output_dir: str = './videos',
                                     audio_dir: str = './audio',
                                     metadata_file: str = './videos/video_metadata.csv',
                                     cookie_file: str | None = None):
    """
    Downloads a YouTube video, extracts its audio, logs metadata, and skips processed videos.
    """
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(audio_dir, exist_ok=True)

    # Define the metadata columns
    metadata_columns = [
        'title', 'channel_name', 'url', 'filename',
        'download_date', 'duration_seconds', 'resolution', 'audio_filename'
    ]

    # Load or initialize metadata DataFrame
    if os.path.exists(metadata_file):
        try:
            metadata_df = pd.read_csv(metadata_file)
            # Ensure all required columns exist (for backward compatibility)
            for col in metadata_columns:
                if col not in metadata_df.columns:
                    metadata_df[col] = None
            # Reorder columns for consistency
            metadata_df = metadata_df[metadata_columns]
        except pd.errors.EmptyDataError:
            metadata_df = pd.DataFrame(columns=metadata_columns)
    else:
        metadata_df = pd.DataFrame(columns=metadata_columns)

    # Skip if video URL already processed
    if video_url in metadata_df['url'].values:
        print(f"‚è© Video already in metadata (skipped): {video_url}")
        return

    # yt-dlp options
    ydl_opts = {
        # Get 480p video + best audio, merge into mp4
        'format': 'bestvideo[height=480][ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        'outtmpl': os.path.join(output_dir, '%(title)s.%(ext)s'),
        'noplaylist': True,
        'merge_output_format': 'mp4',
        'postprocessors': [{'key': 'FFmpegMetadata', 'add_chapters': False}],
        'retries': 5,
        'fragment_retries': 5,
        'no_warnings': True, # Suppress warnings (like SABR)
    }

    if cookie_file:
        ydl_opts['cookiefile'] = cookie_file

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Extract info first without downloading
            info = ydl.extract_info(video_url, download=False)
            video_title = str(info.get('title', 'Unknown Title'))
            channel_name = info.get('uploader', 'Unknown Channel')
            duration = info.get('duration')
            width, height = info.get('width'), info.get('height')
            resolution = f"{width}x{height}" if width and height else "N/A"

            # Get the expected downloaded video path
            expected_video_path = ydl.prepare_filename(info)
            video_name_no_ext = os.path.splitext(os.path.basename(expected_video_path))[0]
            expected_audio_filename = f"{video_name_no_ext}.wav"
            expected_audio_path = os.path.join(audio_dir, expected_audio_filename)

            # Skip if audio file already exists
            if os.path.exists(expected_audio_path):
                print(f"‚è© Audio already exists, assuming processed: {expected_audio_filename}")
                # Log metadata if it was missing (e.g., script interrupted)
                if video_url not in metadata_df['url'].values:
                    new_entry = pd.DataFrame([{
                        'title': video_title,
                        'channel_name': channel_name,
                        'url': video_url,
                        'filename': os.path.basename(expected_video_path),
                        'download_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        'duration_seconds': duration,
                        'resolution': resolution,
                        'audio_filename': expected_audio_filename
                    }])
                    metadata_df = pd.concat([metadata_df, new_entry], ignore_index=True)
                    metadata_df.to_csv(metadata_file, index=False)
                return

            print(f"‚¨áÔ∏è Downloading: '{video_title}' from channel: {channel_name}")
            ydl.download([video_url])

            # Verify download and extract audio
            if os.path.exists(expected_video_path):
                print(f"‚úÖ Download complete: {os.path.basename(expected_video_path)}")
                audio_filename = extract_audio_ffmpeg(expected_video_path, audio_dir)

                # Log new entry to metadata
                new_entry = pd.DataFrame([{
                    'title': video_title,
                    'channel_name': channel_name,
                    'url': video_url,
                    'filename': os.path.basename(expected_video_path),
                    'download_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'duration_seconds': duration,
                    'resolution': resolution,
                    'audio_filename': audio_filename if audio_filename else "N/A"
                }])
                metadata_df = pd.concat([metadata_df, new_entry], ignore_index=True)
                metadata_df.to_csv(metadata_file, index=False)
            else:
                print(f"‚ùå Download reported success but file not found at '{expected_video_path}'")

    except yt_dlp.utils.DownloadError as e:
        print(f"‚ùå yt-dlp Download Error for {video_url}: {e}")
    except Exception as e:
        print(f"‚ùå Unexpected error for {video_url}: {e}")


def verify_and_process_existing_videos(videos_dir: str, audio_dir: str):
    """
    Extract audio for every video in ``videos_dir`` that has no matching ``.wav``.

    A video and an audio file match when their filenames are equal once the
    extension is removed. ``audio_dir`` is created if it does not exist, so a
    first run (or a run after the audio folder was deleted) no longer fails
    while listing it.

    Args:
        videos_dir: Directory containing the downloaded video files.
        audio_dir: Directory containing (and receiving) the ``.wav`` files.

    Returns:
        None. Progress and a success/failure summary are printed.
    """
    print("\n--- Verifying Existing Videos ---")
    if not os.path.isdir(videos_dir):
        print(f"‚ùå Verification skipped: '{videos_dir}' not found")
        return

    # The audio directory may legitimately be missing; treat it as empty.
    os.makedirs(audio_dir, exist_ok=True)

    # Names (without extension) of the audio files that already exist.
    existing_audio_names = {
        os.path.splitext(f)[0]
        for f in os.listdir(audio_dir)
        if f.lower().endswith('.wav')
    }

    # Videos whose name (without extension) has no audio counterpart. Extensions
    # are compared case-insensitively, and names are sorted so the processing
    # order is reproducible.
    video_extensions = ('.mp4', '.mkv', '.webm', '.mov')
    missing_audio_videos = [
        os.path.join(videos_dir, vf)
        for vf in sorted(os.listdir(videos_dir))
        if vf.lower().endswith(video_extensions)
        and os.path.splitext(vf)[0] not in existing_audio_names
    ]

    if not missing_audio_videos:
        print("‚úÖ All videos have corresponding audio files.")
        return

    print(f"‚ö†Ô∏è Found {len(missing_audio_videos)} video(s) missing audio:")
    for v in missing_audio_videos:
        print(f"  - {os.path.basename(v)}")

    success, fail = 0, 0
    for vpath in missing_audio_videos:
        # extract_audio_ffmpeg returns the audio filename on success, None on failure.
        if extract_audio_ffmpeg(vpath, audio_dir):
            success += 1
        else:
            fail += 1

    print("\n--- Verification Summary ---")
    print(f"‚úÖ Extracted: {success}")
    print(f"‚ùå Failed: {fail}")


if __name__ == '__main__':
    # FFmpeg is needed for audio extraction. A missing installation is only
    # reported here; the run continues and individual extractions will fail.
    if not is_ffmpeg_installed():
        print("=" * 60)
        print("‚ö†Ô∏è FFmpeg is not installed or not in your system PATH.")
        print("   This script requires FFmpeg to extract audio.")
        print("   Download from: https://ffmpeg.org/download.html")
        print("=" * 60)
    else:
        print("‚úÖ FFmpeg found.")

    # Define directories and files
    VIDEOS_DIRECTORY = './videos'
    AUDIO_DIRECTORY = './audio'
    METADATA_FILE = os.path.join(VIDEOS_DIRECTORY, 'video_metadata.csv')

    # Videos and playlists to download. Commented-out entries are kept as a
    # record of sources that are currently disabled.
    video_urls = [
        # "https://www.youtube.com/watch?v=OZ5SmpNFlU8&pp=ygUZY2F0YXJhY3Qgc3VyZ2VyeSBuYXJyYXRlZA%3D%3D",
        # "https://youtube.com/playlist?list=PLjgfi4kp5BU7Lxk_O9GLCFzjXITHVonC0&si=ITWCQq8LzXfFRRKv",
        # "https://youtube.com/playlist?list=PL88gdqtVPZep0oIpxOP1AStJPK0-Q2hEj&si=-NwuoIQ1VDi5M-Al",
        # "https://youtube.com/playlist?list=PL88gdqtVPZeq0B1DAszU4uUmXbsksZsUf&si=vuURrNS4U10eE3hX",
        # "https://youtube.com/playlist?list=PL88gdqtVPZepBwG8e9A4HMb-cy0DJmwvp&si=B6XLDmBGXNpYZ7GX",
        # "https://www.youtube.com/playlist?list=PL88gdqtVPZerlilEVmKR-s3RTADHniLqu",
        # "https://youtube.com/playlist?list=PLker0kXqgiOUV92h5wszstCLFQ-JFKATf&si=2OunF06tse7abgHt",
        "https://youtube.com/playlist?list=PLker0kXqgiOVD5NNEES1UAYG7e-T6phV7&si=BQDJzsn_PHp_xFuK",
        "https://youtu.be/-Q4uQ6rEExs?si=4mL-Ysen96rifmsb",
        "https://youtu.be/LDhPCHvGxeA?si=xtkqrHo5gRzvUe4p",
        "https://youtu.be/XjZ5r8GZq5Y?si=sr9ay6WbkXjx-RH2",
        "https://youtu.be/pVTa7UhsyNc?si=2QFVOC-iNL_cSLRF",
        "https://www.youtube.com/watch?v=PLSKmeAV43M",
        "https://youtu.be/soJWJtHoplc?si=HROqo7yRTnzffEka",
        "https://www.youtube.com/watch?v=PLSKmeAV43M",
        "https://www.youtube.com/watch?v=QbeI72QmFAU&pp=ygUZY2F0YXJhY3Qgc3VyZ2VyeSBuYXJyYXRlZA%3D%3D",
        "https://www.youtube.com/watch?v=6aIOKqBA-64",
        "https://www.youtube.com/watch?v=yaAcqYn-Teo&pp=ygUZY2F0YXJhY3Qgc3VyZ2VyeSBuYXJyYXRlZA%3D%3D",
        "https://www.youtube.com/watch?v=O60ZbtRcjik&pp=ygUZY2F0YXJhY3Qgc3VyZ2VyeSBuYXJyYXRlZA%3D%3D",
        "https://www.youtube.com/watch?v=OZ5SmpNFlU8&pp=ygUZY2F0YXJhY3Qgc3VyZ2VyeSBuYXJyYXRlZA%3D%3D",
        "https://www.youtube.com/watch?v=n_3cG9oeuNo&pp=ygUZY2F0YXJhY3Qgc3VyZ2VyeSBuYXJyYXRlZA%3D%3D",
        "https://www.youtube.com/watch?v=-Q4uQ6rEExs&pp=ygUZY2F0YXJhY3Qgc3VyZ2VyeSBuYXJyYXRlZA%3D%3D",
        "https://www.youtube.com/watch?v=moF1tUd9Flc&pp=ygUZY2F0YXJhY3Qgc3VyZ2VyeSBuYXJyYXRlZA%3D%3D",
        "https://www.youtube.com/watch?v=aohAHNYpAOs&pp=ygUZY2F0YXJhY3Qgc3VyZ2VyeSBuYXJyYXRlZA%3D%3D",
        "https://www.youtube.com/watch?v=SrzOrek2PVg",
        "https://www.youtube.com/watch?v=wtw7m6C9HAw",
        "https://www.youtube.com/watch?v=mfQXeuUcJdU&pp=0gcJCfsJAYcqIYzv",
        "https://www.youtube.com/watch?v=s9CYeGi7ecs",
        "https://www.youtube.com/watch?v=HPX8EBCVm_s",
        "https://www.youtube.com/watch?v=vWCCDQgK06U&pp=0gcJCfsJAYcqIYzv",
        "https://www.youtube.com/watch?v=2qEe2REdhWw",
        "https://www.youtube.com/watch?v=3wwr5EzC0r4&pp=0gcJCfsJAYcqIYzv",
        "https://www.youtube.com/watch?v=Q4ez4-t3WhE",
        "https://www.youtube.com/watch?v=vWCCDQgK06U",
        "https://www.youtube.com/watch?v=AV2ZRjYKpSA",
        "https://www.youtube.com/watch?v=vfmohnIFMOQ",
        "https://www.youtube.com/watch?v=2p4V1ZCQneo&pp=0gcJCfsJAYcqIYzv",
        "https://www.youtube.com/watch?v=oNX7mhHewG0",
        "https://www.youtube.com/watch?v=_QXWa7QaEgk",
        "https://www.youtube.com/watch?v=xHdQYKq1LMs",
        "https://www.youtube.com/watch?v=K2OsUADOtLc",
        "https://www.youtube.com/watch?v=G4e9vrU1lrc",
        "https://www.youtube.com/watch?v=Zj0hcokB5Lg&pp=0gcJCfsJAYcqIYzv",
        "https://www.youtube.com/watch?v=uxXRtgQfEFI",
        "https://www.youtube.com/watch?v=M8c_NoP01_A",
        "https://www.youtube.com/watch?v=kD28gc6oqV4",
        "https://www.youtube.com/watch?v=0xUbMicNy-w",
        "https://www.youtube.com/watch?v=if2P7EPOgsY",
    ]

    # Drop exact duplicate URLs (first occurrence wins, order preserved) so the
    # same URL is not inspected over the network more than once.
    unique_video_urls = list(dict.fromkeys(video_urls))
    if len(unique_video_urls) < len(video_urls):
        print(f"Ignoring {len(video_urls) - len(unique_video_urls)} duplicate URL(s) in the input list.")
    video_urls = unique_video_urls

    # Path to your YouTube cookies file (optional, for restricted videos)
    cookie_file_path = 'www.youtube.com_cookies.txt'
    if not os.path.exists(cookie_file_path):
        print(f"‚ö†Ô∏è Cookie file not found at '{cookie_file_path}'. Restricted content may fail.")
        cookie_file_path = None

    print("\n--- Processing Video URLs ---")

    # Individual video URLs collected from single links and expanded playlists.
    all_individual_urls = []

    # yt-dlp options used only to inspect the URLs. 'noplaylist' is left at its
    # default so that playlists are expanded.
    info_opts = {
        'extract_flat': 'in_playlist',  # Get entries without full info
        'skip_download': True,
        'quiet': True,
        'no_warnings': True,  # Suppress warnings (like SABR)
    }
    if cookie_file_path:
        info_opts['cookiefile'] = cookie_file_path

    print("Inspecting provided URLs for playlists...")
    with yt_dlp.YoutubeDL(info_opts) as ydl:
        for url in video_urls:
            print(f"Inspecting: {url}")
            try:
                info = ydl.extract_info(url, download=False)

                if info.get('_type') == 'playlist':
                    print(f"  -> üîó Found playlist: {info.get('title', 'Unknown Playlist')}")
                    # Collect the URL of every playlist entry that has one.
                    playlist_video_urls = [
                        entry.get('url')
                        for entry in info.get('entries', [])
                        if entry and entry.get('url')
                    ]
                    all_individual_urls.extend(playlist_video_urls)
                    print(f"  -> Added {len(playlist_video_urls)} videos from playlist.")
                else:
                    # It's a single video, add its original URL
                    print("  -> Single video found.")
                    all_individual_urls.append(url)

            except yt_dlp.utils.DownloadError as e:
                print(f"  -> ‚ùå Error inspecting URL {url}: {e}")
            except Exception as e:
                print(f"  -> ‚ùå Unexpected error inspecting URL {url}: {e}")

    # A playlist entry can repeat a URL that was also listed directly.
    all_individual_urls = list(dict.fromkeys(all_individual_urls))

    print(f"\n--- Total individual videos to process: {len(all_individual_urls)} ---")

    # Now, process each individual URL
    for i, video_url in enumerate(all_individual_urls):
        download_video_and_extract_audio(
            video_url,
            output_dir=VIDEOS_DIRECTORY,
            audio_dir=AUDIO_DIRECTORY,
            metadata_file=METADATA_FILE,
            cookie_file=cookie_file_path
        )

        # Pause between videos, except after the last one
        if i < len(all_individual_urls) - 1:
            print(f"\n--- üò¥ Sleeping for 1 seconds before next download ({i+2}/{len(all_individual_urls)}) ---")
            time.sleep(1)

    print("\nURL processing batch completed.")

    # Run verification for any videos that might have failed audio extraction
    verify_and_process_existing_videos(VIDEOS_DIRECTORY, AUDIO_DIRECTORY)

    print("\n--- All processing finished ---")





### Remove videos and audio files longer than a duration threshold (e.g., 20 minutes = 1200 seconds)

In [None]:
import os
import pandas as pd
import sys

# --- Configuration ---
# Maximum allowed duration in seconds. The value set here, 1500 s, is 25 minutes;
# for a 20-minute limit use 20 * 60 = 1200.
MAX_DURATION_SECONDS = 1500

# Paths used by the cleanup code below (same values as the download cell's
# VIDEOS_DIRECTORY / AUDIO_DIRECTORY / METADATA_FILE).
VIDEOS_DIRECTORY = './videos'
AUDIO_DIRECTORY = './audio'
METADATA_FILE = os.path.join(VIDEOS_DIRECTORY, 'video_metadata.csv')
# --- End Configuration ---


def cleanup_long_videos(metadata_path, videos_dir, audio_dir, max_seconds):
    """
    Interactively delete videos (and their audio) that exceed a duration limit.

    Reads the metadata CSV, lists every entry whose ``duration_seconds`` is
    greater than ``max_seconds``, asks for a typed "yes", deletes the listed
    video/audio files and rewrites the CSV without those entries. Entries with
    a missing or non-numeric duration are kept.

    Args:
        metadata_path: Path of the metadata CSV.
        videos_dir: Directory holding the video files named in ``filename``.
        audio_dir: Directory holding the audio files named in ``audio_filename``.
        max_seconds: Longest duration (in seconds) that is kept.
    """
    print("--- Video Cleanup Utility ---")

    metadata_present = os.path.exists(metadata_path)
    if not metadata_present:
        print(f"‚ùå Error: Metadata file not found at '{metadata_path}'. Cannot proceed.")
        return

    try:
        metadata = pd.read_csv(metadata_path)
    except pd.errors.EmptyDataError:
        print("‚úÖ Metadata file is empty. Nothing to clean up.")
        return
    except Exception as e:
        print(f"‚ùå Error reading metadata file: {e}")
        return

    if 'duration_seconds' not in metadata.columns:
        print("‚ùå Error: 'duration_seconds' column not found in metadata.")
        return

    # Non-numeric durations become NaN, which is then counted as 0 seconds so
    # that entries of unknown length always stay.
    metadata['duration_numeric'] = pd.to_numeric(metadata['duration_seconds'], errors='coerce')
    within_limit = metadata['duration_numeric'].fillna(0).le(max_seconds)

    df_to_keep = metadata.loc[within_limit]
    df_to_remove = metadata.loc[~within_limit]

    if df_to_remove.empty:
        print(f"‚úÖ No videos found exceeding the {max_seconds}s threshold.")
        # Write the table back without the helper column.
        metadata.drop(columns=['duration_numeric']).to_csv(metadata_path, index=False)
        return

    print(f"Found {len(df_to_remove)} video(s) to remove (duration > {max_seconds}s):")

    # Show what is about to be deleted before asking.
    print("\n--- Videos to be removed ---")
    for _, entry in df_to_remove.iterrows():
        title = entry.get('title', f"URL: {entry.get('url', 'N/A')}")
        duration = entry.get('duration_seconds', 'N/A')
        print(f"  - {title} (Duration: {duration}s)")
    print("------------------------------")

    answer = input("\nAre you sure you want to proceed with deleting these files and entries? (yes/no): ")
    if answer.lower() != 'yes':
        print("Operation cancelled by user.")
        return

    print("\nProceeding with deletion...")
    placeholder_names = ["FAILED", "SKIPPED_DURATION"]
    removed_count = 0
    for _, entry in df_to_remove.iterrows():
        video_name = entry.get('filename')
        audio_name = entry.get('audio_filename')
        title = entry.get('title', f"URL: {entry.get('url', 'N/A')}")

        print(f"\nProcessing '{title}' (Duration: {entry.get('duration_seconds')}s)")

        # 1. Video file
        if pd.isna(video_name) or video_name in placeholder_names:
            print(f"  ‚ÑπÔ∏è No valid video filename listed.")
        elif not os.path.exists(video_path := os.path.join(videos_dir, video_name)):
            print(f"  ü§∑ Video file not found: {video_path}")
        else:
            try:
                os.remove(video_path)
                print(f"  üóëÔ∏è Removed video: {video_path}")
                removed_count += 1
            except Exception as e:
                print(f"  ‚ùå Error removing video {video_path}: {e}")

        # 2. Audio file
        if pd.isna(audio_name) or audio_name in placeholder_names:
            print(f"  ‚ÑπÔ∏è No valid audio filename listed.")
        elif not os.path.exists(audio_path := os.path.join(audio_dir, audio_name)):
            print(f"  ü§∑ Audio file not found: {audio_path}")
        else:
            try:
                os.remove(audio_path)
                print(f"  üóëÔ∏è Removed audio: {audio_path}")
                removed_count += 1
            except Exception as e:
                print(f"  ‚ùå Error removing audio {audio_path}: {e}")

    # 3. Persist the remaining entries, without the helper column.
    try:
        df_to_keep = df_to_keep.drop(columns=['duration_numeric'])
        df_to_keep.to_csv(metadata_path, index=False)
        print(f"\n‚úÖ Successfully updated metadata file: {metadata_path}")
        print(f"Removed {len(df_to_remove)} entries from CSV.")
    except Exception as e:
        print(f"‚ùå CRITICAL: Error writing updated metadata file: {e}")
        print("   Your files may be deleted, but the CSV was not updated.")

    print(f"\n--- Cleanup Summary ---")
    print(f"Removed {len(df_to_remove)} videos from metadata.")
    print(f"Deleted {removed_count} associated files.")
    print("--- Cleanup Finished ---")


if __name__ == '__main__':
    # Banner describing what the cleanup utility works on.
    print("--- Video Cleanup Utility ---")
    print(f"This script will find files over {MAX_DURATION_SECONDS} seconds.")
    print(f"It will read from: {METADATA_FILE}")
    print(f"It will look for files in: {VIDEOS_DIRECTORY} and {AUDIO_DIRECTORY}")
    print("You will be asked for confirmation before any files are deleted.")

    # Placeholder: a leading '--yes' argument is only acknowledged here. Nothing
    # is searched or deleted by this block.
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0].lower() == '--yes':
        print("\n'--yes' flag detected, auto-confirming...")


# --- Refactored cleanup: finding and deleting are separate functions, so the caller decides how to confirm ---

def find_videos_to_remove(metadata_path, max_seconds):
    """
    Split the metadata table into entries to keep and entries to remove.

    Nothing is deleted. An entry is marked for removal when its
    ``duration_seconds`` is greater than ``max_seconds``; missing or non-numeric
    durations count as 0 and are kept. Both returned frames carry an extra
    ``duration_numeric`` helper column.

    Returns:
        ``(df_to_keep, df_to_remove)``, or ``(None, None)`` when the metadata
        file is missing, empty, unreadable, or has no ``duration_seconds`` column.
    """
    nothing = (None, None)

    if not os.path.exists(metadata_path):
        print(f"‚ùå Error: Metadata file not found at '{metadata_path}'. Cannot proceed.")
        return nothing

    try:
        metadata = pd.read_csv(metadata_path)
    except pd.errors.EmptyDataError:
        print("‚úÖ Metadata file is empty. Nothing to clean up.")
        return nothing
    except Exception as e:
        print(f"‚ùå Error reading metadata file: {e}")
        return nothing

    if 'duration_seconds' not in metadata.columns:
        print("‚ùå Error: 'duration_seconds' column not found in metadata.")
        return nothing

    # Unparseable durations become NaN and are then treated as 0 seconds.
    metadata['duration_numeric'] = pd.to_numeric(metadata['duration_seconds'], errors='coerce')
    within_limit = metadata['duration_numeric'].fillna(0).le(max_seconds)

    return metadata.loc[within_limit], metadata.loc[~within_limit]


def delete_videos(df_to_remove, df_to_keep, metadata_path, videos_dir, audio_dir):
    """
    Delete the files of every entry in ``df_to_remove`` and rewrite the CSV.

    For each entry the video (``filename`` in ``videos_dir``) and the audio
    (``audio_filename`` in ``audio_dir``) are removed when a usable name is
    listed and the file exists. Afterwards ``df_to_keep`` is written to
    ``metadata_path`` without the ``duration_numeric`` helper column. Problems
    are printed, not raised.
    """
    print("\nProceeding with deletion...")
    placeholder_names = ["FAILED", "SKIPPED_DURATION"]
    removed_count = 0

    for _, entry in df_to_remove.iterrows():
        video_name = entry.get('filename')
        audio_name = entry.get('audio_filename')
        title = entry.get('title', f"URL: {entry.get('url', 'N/A')}")

        print(f"\nProcessing '{title}' (Duration: {entry.get('duration_seconds')}s)")

        # 1. Video file
        if pd.isna(video_name) or video_name in placeholder_names:
            print(f"  ‚ÑπÔ∏è No valid video filename listed.")
        elif not os.path.exists(video_path := os.path.join(videos_dir, video_name)):
            print(f"  ü§∑ Video file not found: {video_path}")
        else:
            try:
                os.remove(video_path)
                print(f"  üóëÔ∏è Removed video: {video_path}")
                removed_count += 1
            except Exception as e:
                print(f"  ‚ùå Error removing video {video_path}: {e}")

        # 2. Audio file
        if pd.isna(audio_name) or audio_name in placeholder_names:
            print(f"  ‚ÑπÔ∏è No valid audio filename listed.")
        elif not os.path.exists(audio_path := os.path.join(audio_dir, audio_name)):
            print(f"  ü§∑ Audio file not found: {audio_path}")
        else:
            try:
                os.remove(audio_path)
                print(f"  üóëÔ∏è Removed audio: {audio_path}")
                removed_count += 1
            except Exception as e:
                print(f"  ‚ùå Error removing audio {audio_path}: {e}")

    # 3. Persist the surviving entries; the helper column is dropped if present.
    try:
        remaining = df_to_keep.drop(columns=['duration_numeric'], errors='ignore')
        remaining.to_csv(metadata_path, index=False)
        print(f"\n‚úÖ Successfully updated metadata file: {metadata_path}")
        print(f"Removed {len(df_to_remove)} entries from CSV.")
    except Exception as e:
        print(f"‚ùå CRITICAL: Error writing updated metadata file: {e}")
        print("   Your files may be deleted, but the CSV was not updated.")

    print(f"\n--- Cleanup Summary ---")
    print(f"Removed {len(df_to_remove)} videos from metadata.")
    print(f"Deleted {removed_count} associated files.")
    print("--- Cleanup Finished ---")


if __name__ == '__main__':
    print(f"--- Video Cleanup Utility ---")
    print(f"This script will find files over {MAX_DURATION_SECONDS} seconds.")
    print(f"It will read from: {METADATA_FILE}")
    print(f"It will look for files in: {VIDEOS_DIRECTORY} and {AUDIO_DIRECTORY}")

    # 1. Find videos
    df_to_keep, df_to_remove = find_videos_to_remove(METADATA_FILE, MAX_DURATION_SECONDS)

    # The three outcomes are handled by plain branching instead of sys.exit():
    # raising SystemExit inside a notebook cell is reported as an error, and in a
    # flat script it would end the run before any later step.
    if df_to_remove is None:
        # find_videos_to_remove already printed why the metadata is unusable.
        pass
    elif df_to_remove.empty:
        # 2. Nothing exceeds the threshold
        print(f"‚úÖ No videos found exceeding the {MAX_DURATION_SECONDS}s threshold.")
        # Write the table back without the temporary helper column.
        if df_to_keep is not None and 'duration_numeric' in df_to_keep.columns:
            df_to_keep = df_to_keep.drop(columns=['duration_numeric'])
            df_to_keep.to_csv(METADATA_FILE, index=False)
    else:
        # 3. List videos
        print(f"\nFound {len(df_to_remove)} video(s) to remove (duration > {MAX_DURATION_SECONDS}s):")
        print("------------------------------")
        for index, row in df_to_remove.iterrows():
            title = row.get('title', f"URL: {row.get('url', 'N/A')}")
            duration = row.get('duration_seconds', 'N/A')
            print(f"  - {title} (Duration: {duration}s)")
        print("------------------------------")

        # 4. '--yes' (at any argument position) skips the interactive question
        auto_confirm = any(arg.lower() == '--yes' for arg in sys.argv[1:])

        if auto_confirm:
            print("\n'--yes' flag detected, proceeding with deletion...")
            proceed = True
        else:
            confirm = input("\nAre you sure you want to proceed with deleting these files and entries? (yes/no): ")
            proceed = confirm.lower() == 'yes'

        # 5. Execute deletion if confirmed
        if proceed:
            delete_videos(df_to_remove, df_to_keep, METADATA_FILE, VIDEOS_DIRECTORY, AUDIO_DIRECTORY)
        else:
            print("Operation cancelled by user.")

In [None]:
import torch

# Release cached GPU memory held by PyTorch, when a CUDA device is present.
if not torch.cuda.is_available():
    print("CUDA not available.")
else:
    torch.cuda.empty_cache()
    print("CUDA memory cleared.")

In [None]:
!pip install -q git+https://github.com/openai/whisper.git

In [None]:
# pip install git+https://github.com/openai/whisper.git

import os
import whisper
import torch
import json
from tqdm import tqdm

def transcribe_audio_files(input_dir: str = './audio',
                           output_dir: str = './transcripts',
                           model_name: str = 'large'):
    """
    Transcribe every not-yet-transcribed ``.wav`` file with a Whisper model.

    For each ``<name>.wav`` in ``input_dir`` that has no ``<name>.json`` in
    ``output_dir``, the segment-level transcript is saved as a JSON list of
    ``{"start", "end", "text"}`` objects, with timestamps rounded to one
    decimal place.

    The work list is built before the model is loaded, so a run with nothing to
    do returns without loading the model. Each JSON file is written to a
    temporary name and renamed when complete, so an interrupted write cannot
    leave a truncated ``.json`` that later runs would count as finished.

    Args:
        input_dir: The directory containing the .wav files (16kHz mono).
        output_dir: The directory where the transcription .json files will be saved.
        model_name: Name passed to ``whisper.load_model`` (default ``"large"``).

    Returns:
        None. Per-file errors are reported and the remaining files are still processed.
    """
    print("--- Starting Audio Transcription Process ---")

    # 1. Setup directories
    if not os.path.isdir(input_dir):
        print(f"Error: audio directory not found at '{input_dir}'. Nothing to transcribe.")
        return
    os.makedirs(output_dir, exist_ok=True)

    # 2. Identify audio files to process (before the expensive model load)
    audio_files = {os.path.splitext(f)[0] for f in os.listdir(input_dir) if f.endswith('.wav')}
    transcribed_files = {os.path.splitext(f)[0] for f in os.listdir(output_dir) if f.endswith('.json')}
    files_to_process = sorted(f + '.wav' for f in (audio_files - transcribed_files))

    if not files_to_process:
        print("‚úÖ All audio files have already been transcribed.")
        return

    print(f"Found {len(files_to_process)} audio file(s) to transcribe.")

    # 3. Check for GPU and load the pre-trained Whisper model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device.upper()}")
    if device == 'cpu':
        print("‚ö†Ô∏è WARNING: No GPU found. Transcription will be very slow.")

    print(f"Loading Whisper model ({model_name})...")
    try:
        model = whisper.load_model(model_name, device=device)
        print("‚úÖ Model loaded successfully.")
    except Exception as e:
        print(f"‚ùå Error loading Whisper model: {e}")
        print("Please check your internet connection and if 'openai-whisper' is installed correctly.")
        return

    # 4. Process each audio file
    for filename in tqdm(files_to_process, desc="Transcribing Audio"):
        input_path = os.path.join(input_dir, filename)
        output_filename = f"{os.path.splitext(filename)[0]}.json"
        output_path = os.path.join(output_dir, output_filename)
        # Does not end in '.json', so an abandoned temp file is never counted as done.
        temp_path = f"{output_path}.tmp"

        try:
            # Perform transcription (segment-level timestamps by default);
            # half precision is only requested when a GPU is available.
            result = model.transcribe(input_path, fp16=torch.cuda.is_available())

            # Keep only start/end/text per segment, timestamps rounded to 0.1 s
            segments = [
                {
                    "start": round(seg["start"], 1),
                    "end": round(seg["end"], 1),
                    "text": seg["text"].strip()
                }
                for seg in result["segments"]
            ]

            with open(temp_path, 'w', encoding='utf-8') as f:
                json.dump(segments, f, indent=4, ensure_ascii=False)
            os.replace(temp_path, output_path)

        except Exception as e:
            tqdm.write(f"‚ùå Error transcribing {filename}: {e}")
            # Discard a half-written temporary file, if any.
            if os.path.exists(temp_path):
                try:
                    os.remove(temp_path)
                except OSError as cleanup_error:
                    tqdm.write(f"   Could not remove temporary file {temp_path}: {cleanup_error}")

    print("\n--- Audio Transcription process completed. ---")


if __name__ == '__main__':
    # Prerequisite: ffmpeg must be installed and reachable through PATH.
    #   Debian/Ubuntu:        sudo apt update && sudo apt install ffmpeg
    #   macOS (Homebrew):     brew install ffmpeg
    #   Windows (Chocolatey): choco install ffmpeg
    # Runs with the defaults: reads ./audio and writes ./transcripts.
    transcribe_audio_files()

In [None]:
!pip -q install cerebras-cloud-sdk

In [None]:
"""
Transcript Refinement Pipeline (to TXT) – FINAL CLEAN VERSION
No cv2 → No RAM crash | No duration → Faster & lighter | Video matching preserved
"""

import os
import json
import time
from tqdm import tqdm
import csv

# --- Colab or Local ---
try:
    from google.colab import userdata
    COLAB_ENV = True
except ImportError:
    from dotenv import load_dotenv
    COLAB_ENV = False

from cerebras.cloud.sdk import Cerebras


# --- Configuration ---
# Directory scanned for the raw transcript ``.json`` files to refine.
INPUT_DIR = './transcripts'
# Directory searched for a video file whose base name matches each transcript.
VIDEOS_DIR = './videos'
# Directory that receives one refined ``.txt`` transcript per input file.
REFINED_OUTPUT_DIR = './refined_transcripts'
# Sub-directory where the unmodified model reply for each transcript is saved.
FULL_RESPONSE_DIR = os.path.join(REFINED_OUTPUT_DIR, 'full_responses')
# CSV file that gets one appended row per processed transcript.
CSV_LOG_PATH = os.path.join(REFINED_OUTPUT_DIR, 'refinement_log.csv')

# Seconds to sleep between consecutive transcripts (skipped after the last one).
API_CALL_DELAY_SECONDS = 20
# Upper bound on how many pending transcripts a single run will process.
MAX_FILES_PER_RUN = 300

# Model identifier passed to the chat-completions call. The commented-out line
# is the alternative "thinking" variant; parse_llm_json_output() handles the
# </think> tag that such a model emits.
# MODEL_NAME = "qwen-3-235b-a22b-thinking-2507"
MODEL_NAME = "qwen-3-235b-a22b-instruct-2507"

# System prompt sent with every request: the model may only correct the "text"
# fields and must return the same JSON array structure.
MEDICAL_EDITOR_SYSTEM_PROMPT = """You are an expert JSON and medical editor. Your task is to correct typos, punctuation, and grammatical errors in a JSON file provided by the user, while preserving its exact structure.

The user will provide a JSON array of segments from a cataract surgery video.
Your job is to fix errors **only** in the "text" fields.

**CRITICAL INSTRUCTIONS:**
1.  Read the user's JSON, perform your corrections, and think.
2.  You **MUST** return the JSON in the **EXACT** same array format, including "start", "end", and "text" keys for every segment.
3.  **DO NOT** alter the "start", "end", or any other part of the JSON structure.
4.  **DO NOT** include any commentary, conversational replies, or pre-amble.
5.  The output must be the pure, corrected JSON data and nothing else."""


# ==============================================================================
# Helper Functions
# ==============================================================================

def get_api_key():
    """Return the Cerebras API key for the current runtime.

    When running in Colab (``COLAB_ENV`` is true) the key is fetched with
    ``userdata.get``; otherwise it is read from the ``CEREBRAS_API_KEY``
    environment variable, yielding ``None`` when that variable is unset.
    """
    if not COLAB_ENV:
        return os.environ.get("CEREBRAS_API_KEY")
    return userdata.get('CEREBRAS_API_KEY')

def parse_llm_json_output(raw_output: str) -> str:
    """
    Pull the JSON array out of a raw model reply.

    Two reply shapes are handled:
      - "thinking" models: everything up to and including the first
        </think> tag is discarded before scanning
      - "instruct" models: there is no tag, so the whole reply is scanned

    Returns the text from the first '[' through the last ']' of the
    remaining reply. If no such bracket pair exists, the stripped remainder
    is returned as-is so the caller's JSON parse can report the problem.
    """
    closing_tag = "</think>"

    # partition() splits on the first occurrence; an empty separator slot
    # means the tag was absent and the full reply must be used.
    _, found_tag, tail = raw_output.partition(closing_tag)
    candidate = (tail if found_tag else raw_output).strip()

    open_idx = candidate.find("[")
    close_idx = candidate.rfind("]")

    # Both brackets must exist and be in the right order.
    if -1 < open_idx < close_idx:
        return candidate[open_idx:close_idx + 1].strip()

    return candidate


def format_time_to_mm_ss(seconds: float) -> str:
    """Render a duration in seconds as a zero-padded ``MM:SS`` string.

    The fractional part is dropped by ``int()``. Minutes are never folded
    into hours, so one hour is rendered as ``60:00``.
    """
    minutes, remainder = divmod(int(seconds), 60)
    return f"{minutes:02d}:{remainder:02d}"

def format_segments_to_txt(segments: list) -> str:
    """Turn transcript segments into ``[MM:SS - MM:SS]: text`` lines.

    Each segment is expected to provide "start", "end" and "text" keys. A
    segment missing any of them is left out of the result. The rendered lines
    are joined with newlines (no trailing newline).
    """

    def _render(segment) -> str:
        # Keys are read in start/end/text order; a missing key raises KeyError.
        begin = format_time_to_mm_ss(segment["start"])
        finish = format_time_to_mm_ss(segment["end"])
        return f"[{begin} - {finish}]: {segment['text'].strip()}"

    rendered = []
    for segment in segments:
        try:
            rendered.append(_render(segment))
        except KeyError:
            # Incomplete segment: skip it and keep the rest.
            pass
    return "\n".join(rendered)

def find_matching_video(videos_dir: str, basename: str) -> tuple[str, bool]:
    """Look up the video file that belongs to a transcript base name.

    The comparison ignores case for both the file stem and the extension, and
    only files with a known video extension are considered.

    Args:
        videos_dir: Directory to search; it may be missing.
        basename: Transcript file name without its extension.

    Returns:
        ``(filename, True)`` for the first matching entry in directory-listing
        order, or ``("NOT_FOUND", False)`` when the directory does not exist
        or nothing matches.
    """
    video_suffixes = {
        '.mp4', '.mov', '.avi', '.mkv', '.webm',
        '.flv', '.m4v', '.wmv', '.mpeg', '.mpg',
    }

    if os.path.isdir(videos_dir):
        wanted = basename.lower()
        for candidate in os.listdir(videos_dir):
            stem, suffix = os.path.splitext(candidate)
            if stem.lower() != wanted:
                continue
            if suffix.lower() in video_suffixes:
                return candidate, True

    return "NOT_FOUND", False

def refine_with_llm(text: str, model_name: str, api_key: str) -> str | None:
    """Send one transcript payload to the Cerebras chat API and return the raw reply.

    Args:
        text: The user message, i.e. the transcript segments serialized as JSON.
        model_name: Model identifier passed to the chat-completions call.
        api_key: Cerebras API key used to construct the client.

    Returns:
        The content of the first choice's message on success.
        The JSON string ``{"error": "MODEL CONTEXT LIMIT EXCEEDED"}`` when the
        error indicates the request was too large for the model; retrying the
        same payload cannot succeed, so the caller gets a non-None sentinel.
        ``None`` for any other error, after sleeping 10 seconds, so the caller
        can retry.
    """
    try:
        client = Cerebras(api_key=api_key)
        messages = [
            {"role": "system", "content": MEDICAL_EDITOR_SYSTEM_PROMPT},
            {"role": "user", "content": text}
        ]
        response = client.chat.completions.create(messages=messages, model=model_name)
        return response.choices[0].message.content
    except Exception as e:
        message = str(e).lower()

        # Rate-limit errors (HTTP 429) typically also contain the word "limit".
        # They are transient and must go down the retry path instead of being
        # reported as a context overflow, so classify them first.
        rate_limited = getattr(e, "status_code", None) == 429 or any(
            marker in message
            for marker in (
                "rate limit",
                "rate_limit",
                "too many requests",
                "too_many_requests",
                "quota",
            )
        )

        if not rate_limited and any(x in message for x in ["context", "limit", "too large"]):
            tqdm.write(f"Context limit exceeded: {e}")
            return '{"error": "MODEL CONTEXT LIMIT EXCEEDED"}'

        tqdm.write(f"API error: {e} ‚Üí retrying in 10s...")
        time.sleep(10)
        return None


# ==============================================================================
# CSV Logging
# ==============================================================================

def setup_csv_log():
    """Create the refinement CSV log with its header row unless it already exists."""
    if not os.path.exists(CSV_LOG_PATH):
        header = [
            "video_name",
            "transcript_name",
            "total_characters",
            "total_words",
            "video_found",
            "success",
        ]
        with open(CSV_LOG_PATH, 'w', newline='', encoding='utf-8') as log_file:
            csv.writer(log_file).writerow(header)

def log_result(row: list):
    """Append a single result row to the refinement CSV log."""
    with open(CSV_LOG_PATH, 'a', newline='', encoding='utf-8') as log_file:
        writer = csv.writer(log_file)
        writer.writerow(row)

# ==============================================================================
# Main Pipeline
# ==============================================================================

def refine_transcripts():
    """Refine every pending transcript with the LLM and save it as a .txt file.

    Steps:
      1. Ensure the output directories and the CSV log exist, and fetch the
         API key (the run stops early if no key is available).
      2. Select transcripts that have a ``.json`` in ``INPUT_DIR`` but no
         ``.txt`` in ``REFINED_OUTPUT_DIR``, capped at ``MAX_FILES_PER_RUN``.
      3. For each one: look up the matching video, send the segments to the
         model, save the raw reply, parse it, write the formatted ``.txt`` and
         append a row to the CSV log.

    API calls are retried a bounded number of times per transcript. A
    transcript that exhausts its attempts is logged as failed and skipped, and
    the run is aborted once several consecutive transcripts fail that way
    (e.g. invalid key or service outage). Because finished transcripts are
    detected by their ``.txt`` file, a later run resumes where this one stopped.
    """
    # Upper bound on API attempts for one transcript; refine_with_llm() already
    # sleeps between failed attempts.
    max_attempts_per_file = 5
    # Number of consecutive transcripts allowed to exhaust their attempts
    # before the whole run is stopped.
    max_consecutive_api_failures = 3

    print("\n=== TRANSCRIPT REFINEMENT PIPELINE (LIGHT & STABLE) ===\n")

    os.makedirs(REFINED_OUTPUT_DIR, exist_ok=True)
    os.makedirs(FULL_RESPONSE_DIR, exist_ok=True)
    setup_csv_log()

    api_key = get_api_key()
    if not api_key:
        print("CEREBRAS_API_KEY not found!")
        return

    # Find files to process
    try:
        raw = {os.path.splitext(f)[0] for f in os.listdir(INPUT_DIR) if f.endswith('.json')}
        done = {os.path.splitext(f)[0] for f in os.listdir(REFINED_OUTPUT_DIR) if f.endswith('.txt')}
    except FileNotFoundError as e:
        print(f"Directory not found: {e}")
        return

    to_process = sorted(raw - done)[:MAX_FILES_PER_RUN]
    if not to_process:
        print("All transcripts already refined!")
        return

    print(f"Processing {len(to_process)} transcript(s)...\n")

    consecutive_api_failures = 0

    for basename in tqdm(to_process, desc="Refining"):
        video_name, video_found = find_matching_video(VIDEOS_DIR, basename)
        if video_found:
            tqdm.write(f"  Found video: {video_name}")
        else:
            tqdm.write(f"  No video found for: {basename}")

        json_path = os.path.join(INPUT_DIR, f"{basename}.json")
        txt_path = os.path.join(REFINED_OUTPUT_DIR, f"{basename}.txt")
        full_path = os.path.join(FULL_RESPONSE_DIR, f"{basename}_full_response.txt")

        # Load original (JSON and Unicode decoding errors are ValueError subclasses)
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                segments = json.load(f)
        except (OSError, ValueError) as e:
            tqdm.write(f"Failed to load {basename}.json ‚Üí {e}")
            continue

        if not segments:
            tqdm.write(f"Empty transcript: {basename}")
            continue

        payload = json.dumps(segments, indent=4)

        # Call LLM with a bounded number of attempts so a persistent API
        # failure cannot hang the run forever.
        raw_response = None
        for _ in range(max_attempts_per_file):
            raw_response = refine_with_llm(payload, MODEL_NAME, api_key)
            if raw_response is not None:
                break

        if raw_response is None:
            tqdm.write(f"Giving up on {basename} after {max_attempts_per_file} failed API attempts")
            log_result([video_name, f"{basename}.txt", 0, 0, video_found, False])
            consecutive_api_failures += 1
            if consecutive_api_failures >= max_consecutive_api_failures:
                tqdm.write(
                    f"Aborting run: {consecutive_api_failures} transcripts in a row "
                    "could not be sent to the API. Re-run later to resume."
                )
                return
            continue
        consecutive_api_failures = 0

        # Save raw response
        with open(full_path, 'w', encoding='utf-8') as f:
            f.write(raw_response)

        # Parse and save refined .txt
        success = False
        total_chars = total_words = 0
        try:
            cleaned_json = parse_llm_json_output(raw_response)
            refined_segments = json.loads(cleaned_json)

            # The context-limit sentinel (and any other non-array reply) parses
            # as valid JSON but is not a list of segments.
            if not isinstance(refined_segments, list):
                raise ValueError("model response is not a JSON array of segments")

            total_words = sum(len(seg.get("text", "").split()) for seg in refined_segments)
            formatted_text = format_segments_to_txt(refined_segments)
            total_chars = len(formatted_text)

            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(formatted_text)

            success = True
        except Exception as e:
            # Boundary handler: one bad reply must not stop the remaining files.
            tqdm.write(f"Failed to parse/save {basename} ‚Üí {e}")

        # Log result
        log_result([
            video_name,
            f"{basename}.txt",
            total_chars,
            total_words,
            video_found,
            success
        ])

        # Rate limit
        if basename != to_process[-1]:
            time.sleep(API_CALL_DELAY_SECONDS)

    print("\nALL DONE ‚Äì NO MORE COLAB CRASHES!")


# ==============================================================================
# Run
# ==============================================================================

if __name__ == '__main__':
    # Outside Colab, load_dotenv() is called first so that get_api_key() can
    # read CEREBRAS_API_KEY from the environment.
    if not COLAB_ENV:
        load_dotenv()
    refine_transcripts()

In [None]:
import os

# Summarize how many files each pipeline stage has produced so far.
videos_list = os.listdir("./videos")
print(f"Number of Videos {len(videos_list)}")
print(f"Number of Audios {len(os.listdir('./audio'))}")
print(f"Number of transcripts {len(os.listdir('./transcripts'))}")

# ./refined_transcripts also contains the full_responses/ folder and
# refinement_log.csv, so count only the refined .txt transcripts.
refined_txt_count = sum(
    1 for name in os.listdir('./refined_transcripts/') if name.endswith('.txt')
)
print(f"Number of refined transcripts: {refined_txt_count}")