# Step 1: Download YouTube Video

In [2]:

# youtube_url = "https://www.youtube.com/watch?v=nykOeWgQcHM"
import argparse
import os
from subprocess import call
from yt_dlp import YoutubeDL

# Check if running in a Jupyter Notebook environment
def in_notebook():
    try:
        shell = get_ipython().__class__.__name__
        if shell == 'ZMQInteractiveShell':
            return True   # Jupyter notebook or qtconsole
        elif shell == 'TerminalInteractiveShell':
            return False  # Terminal running IPython
        else:
            return False  # Other type (?)
    except NameError:
        return False      # Probably standard Python interpreter

# Set up argument parser
parser = argparse.ArgumentParser()
parser.add_argument('-url', '--url', help='YouTube video URL', required=not in_notebook())
args = parser.parse_args([] if in_notebook() else None)

# Set VIDEO_URL based on the environment
VIDEO_URL = args.url if not in_notebook() else "https://www.youtube.com/watch?v=nykOeWgQcHM"

# Download the YouTube video as an MP4
def download_video(url):
    ydl_opts = {
        'format': 'best[ext=mp4]',  # Ensures MP4 format
        'outtmpl': 'downloaded_video.mp4',  # Save as downloaded_video.mp4
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

# Download the video
download_video(VIDEO_URL)

# Set paths for the downloaded video
video_path = 'downloaded_video.mp4'
video_root = os.path.dirname(video_path)
video_name = os.path.basename(video_path)
video_path_wo_suffix = os.path.splitext(video_name)[0]

Deprecated Feature: Support for Python version 3.8 has been deprecated. Please update to Python 3.9 or above


[youtube] Extracting URL: https://www.youtube.com/watch?v=nykOeWgQcHM
[youtube] nykOeWgQcHM: Downloading webpage
[youtube] nykOeWgQcHM: Downloading ios player API JSON
[youtube] nykOeWgQcHM: Downloading mweb player API JSON
[youtube] nykOeWgQcHM: Downloading player dad5a960
[youtube] nykOeWgQcHM: Downloading m3u8 information
[info] nykOeWgQcHM: Downloading 1 format(s): 18
[download] Destination: downloaded_video.mp4
[download] 100% of   84.87MiB in 00:00:18 at 4.48MiB/s     


## Extract audio from the downloaded video

In [26]:
# pip! brew install ffmpeg

# # Extract audio from the downloaded video
# def extract_audio(video_path):
#     audio_path = 'extracted_audio.mp3'
#     call(['ffmpeg', '-i', video_path, '-q:a', '0', '-map', 'a', audio_path])
#     return audio_path

# Extract audio from the downloaded video as WAV
def extract_audio(video_path):
    audio_path = 'extracted_audio.wav'
    try:
        call(['ffmpeg', '-i', video_path, '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '2', audio_path])
    except CalledProcessError as e:
        print(f"Error during audio extraction: {e}")
        return None
    return audio_path

# Extract audio from the downloaded video
audio_path = extract_audio(video_path)

ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

In [8]:
audio_path

'extracted_audio.mp3'

# Perform scene detection and split the video into scenes

In [None]:
import cv2
from scenedetect import detect, ContentDetector
from scenedetect.video_manager import VideoManager
from scenedetect.scene_manager import SceneManager
from scenedetect.stats_manager import StatsManager
import os
import json

def format_timedelta(seconds):
    hours, remainder = divmod(int(seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

# Input video file
video_path = "downloaded_video.mp4"

# Output directory for scenes
output_dir = "scenes"
os.makedirs(output_dir, exist_ok=True)

# Perform scene detection
video_manager = VideoManager([video_path])
stats_manager = StatsManager()
scene_manager = SceneManager(stats_manager)

scene_manager.add_detector(ContentDetector())
video_manager.set_downscale_factor()

video_manager.start()
scene_manager.detect_scenes(frame_source=video_manager)

scenes = scene_manager.get_scene_list()

# Extract frames and save timestamps
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)

timestamps = {}

for i, scene in enumerate(scenes):
    
    scene_start_frame = scene[0].get_frames()
    cap.set(cv2.CAP_PROP_POS_FRAMES, scene_start_frame)
    ret, frame = cap.read()
    
    if ret:
        scene_filename = f"scene_{i:03d}.png"
        cv2.imwrite(os.path.join(output_dir, scene_filename), frame)
        
        timestamp = scene_start_frame / fps
        timestamps[scene_filename] = format_timedelta(timestamp)

cap.release()

# Save timestamps to JSON file
with open(os.path.join(output_dir, "timestamps.json"), "w") as f:
    json.dump(timestamps, f, indent=4)

print(f"Scene detection complete. Selected scenes saved in '{output_dir}' folder.")
print(f"Timestamps saved in '{output_dir}/timestamps.json'.")

VideoManager is deprecated and will be removed.


Scene detection complete. Selected scenes saved in 'scenes' folder.
Timestamps saved in 'scenes/timestamps.json'.


In [1]:
import os
import cv2
import torch


In [5]:
# Set paths
input_folder = 'scenes'
output_folder = 'filtered_scenes'


# Ensure the output folder exists
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Load YOLOv5 model (pre-trained)
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')  # You can use 'yolov5m', 'yolov5l', etc. for larger models

# Function to check if an image contains a person using YOLOv5
def contains_person(image_path):
    # Read the image
    img = cv2.imread(image_path)
    # Convert BGR (OpenCV format) to RGB (YOLOv5 input format)
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Inference with YOLOv5
    results = model(img_rgb)  # Make prediction
    labels = results.names  # Class labels
    
    # Check if "person" (class 0) is detected
    person_detected = any(label == 'person' for label in results.names)
    return person_detected

# Process each image in the input folder
for filename in os.listdir(input_folder):
    image_path = os.path.join(input_folder, filename)
    if os.path.isfile(image_path) and image_path.lower().endswith(('.png', '.jpg', '.jpeg')):
        print(f"Processing {filename}...")
        
        # Check if the image contains a person
        if contains_person(image_path):
            print(f"Person detected in {filename}, deleting...")
            os.remove(image_path)  # Delete image with person
        else:
            # Move the image to the filtered folder if no person detected
            output_path = os.path.join(output_folder, filename)
            os.rename(image_path, output_path)

print("Processing complete!")



Using cache found in /Users/syedalihaider/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-11-20 Python-3.8.8 torch-2.2.2 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


Processing scene_039.png...
Processing scene_011.png...
Processing scene_005.png...
Processing scene_004.png...
Processing scene_010.png...
Processing scene_038.png...
Processing scene_006.png...
Processing scene_012.png...
Processing scene_013.png...
Processing scene_007.png...
Processing scene_003.png...
Processing scene_017.png...
Processing scene_016.png...
Processing scene_002.png...
Processing scene_014.png...
Processing scene_000.png...
Processing scene_028.png...
Processing scene_029.png...
Processing scene_001.png...
Processing scene_015.png...
Processing scene_106.png...
Processing scene_112.png...
Processing scene_099.png...
Processing scene_072.png...
Processing scene_066.png...
Processing scene_067.png...
Processing scene_073.png...
Processing scene_098.png...
Processing scene_113.png...
Processing scene_107.png...
Processing scene_111.png...
Processing scene_105.png...
Processing scene_059.png...
Processing scene_065.png...
Processing scene_071.png...
Processing scene_070

# Perform ORC

In [None]:
import os
import cv2
import pytesseract
import json
import re
from datetime import timedelta

def load_timestamps(json_path):
    """
    Load timestamps from a JSON file.
    
    Args:
        json_path (str): Path to the JSON file containing timestamps.
        
    Returns:
        dict: A dictionary mapping filenames to timestamps.
    """
    try:
        with open(json_path, 'r') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading timestamps from {json_path}: {e}")
        return {}

def perform_ocr_with_preprocessing(image_dir, output_json_path, timestamps, max_text_length=500, language='eng'):
    """
    Performs OCR on images in the specified directory with preprocessing, extracts text with truncation, 
    and saves results to a JSON file with timestamps from the provided JSON file.
    
    Args:
        image_dir (str): Path to the directory containing images.
        output_json_path (str): Path to save the output JSON file.
        timestamps (dict): Dictionary mapping filenames to timestamps.
        max_text_length (int): Maximum number of characters for each OCR text output.
        language (str): Language code for OCR, e.g., 'eng' for English.
        
    Returns:
        dict: A dictionary containing OCR results with timestamps.
    """
    ocr_results = []

    for image_file in os.listdir(image_dir):
        if image_file.endswith(".png"):
            image_path = os.path.join(image_dir, image_file)
            
            # Get timestamp from the timestamps dictionary
            timestamp = timestamps.get(image_file, "00:00:00")

            try:
                # Load image
                image = cv2.imread(image_path)
                
                # Preprocess image
                gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
                _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)  # Apply binary thresholding

                # Perform OCR on the preprocessed image
                text = pytesseract.image_to_string(thresh, lang=language).strip()
                
                # Truncate text if it exceeds max_text_length
                if len(text) > max_text_length:
                    text = text[:max_text_length] + "..."  # Add ellipsis to indicate truncation
                
                # Append result for this image with timestamp
                ocr_results.append({
                    "timestamp": timestamp,
                    "file_name": image_file,
                    "text": text
                })
                
            except Exception as e:
                print(f"Error processing {image_file}: {e}")
    
    # Save OCR results to JSON
    try:
        with open(output_json_path, "w", encoding="utf-8") as json_file:
            json.dump(ocr_results, json_file, ensure_ascii=False, indent=4)
        print(f"OCR results saved to {output_json_path}")
    except Exception as json_error:
        print(f"Error saving JSON file: {json_error}")
    
    return ocr_results

# Main execution
image_dir = 'scenes'  # Path to directory with images
timestamps_json_path = os.path.join(image_dir, 'timestamps.json')  # Path to JSON file with timestamps
output_json_path = 'ocr_results_with_timestamps_newest.json'  # Path to save JSON output
max_text_length = 50000  # Maximum number of characters per text segment

# Load timestamps from JSON file
timestamps = load_timestamps(timestamps_json_path)

ocr_results = perform_ocr_with_preprocessing(image_dir, output_json_path, timestamps, max_text_length, language='eng')


OCR results saved to ocr_results_with_timestamps_newest.json


# Transcribe audio using Whisper

In [28]:
# Transcribe audio using Whisper

import whisper
def transcribe_audio(audio_path):
    try:
        # Load the pre-trained Whisper model
        model = whisper.load_model("base")  # or choose a different model size
        
        # Transcribe audio
        result = model.transcribe(audio_path)
        return result['text']
    
    except ImportError:
        raise ImportError("Please install the whisper library: pip install openai-whisper")
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return None

# Replace 'audio_path' with the path to your audio file
transcription = transcribe_audio(audio_path)
print("\nTranscription:")
print(transcription)


100%|███████████████████████████████████████| 139M/139M [00:06<00:00, 21.0MiB/s]



Transcription:
 The following content is provided under a Creative Commons license. Your support will help MIT OpenCourseWare continue to offer high quality educational resources for free. To make a donation or view additional materials from hundreds of MIT courses, visit MIT OpenCourseWare at ocw.mit.edu. All right, let's begin. So as I mentioned before, this lecture will be recorded for ocw. So again, in future lectures, if you don't want to have the back of your head show up, just don't sit in this front area here. So first of all, while water crowd you guys were finally in 26-100, 64-pollon made it big, huh? So good afternoon and welcome to the very first class of 64-pollon and also 600, the semester. So my name is Anna Bell, first name Anna, last name Bell. I'm a lecturer in the EES department. And I'll be giving some of the lectures for today, along with later on in the term Professor Eric Crimson, who's sitting right down there, we'll be giving some of the lectures as well. Oka

In [None]:
import whisper
import json
from pydub import AudioSegment

# Load Whisper model once to use for all chunks
model = whisper.load_model("base")

def split_audio(audio_path, chunk_length_ms=60000):
    audio = AudioSegment.from_file(audio_path)
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
    return chunks

def transcribe_audio_segment(audio_segment, model, segment_index):
    # Export chunk to temp file
    temp_path = f"temp_chunk_{segment_index}.wav"
    audio_segment.export(temp_path, format="wav")
    
    # Transcribe chunk
    result = model.transcribe(temp_path, word_timestamps=True)
    
    # Extract transcription data, handling cases where 'text' or 'segments' might be missing
    transcription_data = {
        "segment_index": segment_index,
        "text": result.get("text", ""),  # Use an empty string if 'text' is missing
        "words": []
    }
    
    # Add word timestamps if available
    if "segments" in result:
        for segment in result["segments"]:
            if "words" in segment:
                for word in segment["words"]:
                    # Safely get 'text', 'start', and 'end' keys with defaults
                    word_text = word.get("text", None)
                    start_time = word.get("start", 0)
                    end_time = word.get("end", 0)
                    
                    # Only add word data if 'text' exists
                    if word_text is not None:
                        transcription_data["words"].append({
                            "word": word_text,
                            "start_time": start_time,
                            "end_time": end_time
                        })
    
    return transcription_data

def transcribe_long_audio_with_segments(audio_path, output_json_path, chunk_length_ms=60000):
    # Split audio into chunks
    audio_chunks = split_audio(audio_path, chunk_length_ms)
    
    # Transcribe each chunk and collect the results
    all_transcriptions = []
    for i, chunk in enumerate(audio_chunks):
        print(f"Transcribing segment {i + 1}/{len(audio_chunks)}...")
        segment_transcription = transcribe_audio_segment(chunk, model, i)
        all_transcriptions.append(segment_transcription)
    
    # Save all transcriptions to JSON
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(all_transcriptions, f, indent=4)
    
    print("Transcription saved to:", output_json_path)

# Example usage
audio_path = "extracted_audio.wav"  # path to your audio file
output_json_path = "transcription_with_segments.json"  # path to save JSON output
transcribe_long_audio_with_segments(audio_path, output_json_path)


# Step 6: Encoding Slide Text, Images, and Audio Text for Vector Database

In [None]:
# Step 6: Encoding Slide Text, Images, and Audio Text for Vector Database

# Text Encoding
from sentence_transformers import SentenceTransformer

text_model = SentenceTransformer('all-MiniLM-L6-v2')
slide_text_embeddings = [text_model.encode(text) for text in slide_texts]
print("Slide text encoding completed.")

# Image Encoding (using CLIP)
import clip
import torch
from PIL import Image

clip_model, preprocess = clip.load("ViT-B/32", device="cpu")
slide_image_embeddings = []
for image_path in slide_images:
    image = preprocess(Image.open(image_path)).unsqueeze(0)
    with torch.no_grad():
        image_embedding = clip_model.encode_image(image)
    slide_image_embeddings.append(image_embedding)
print("Slide image encoding completed.")

# Audio Transcription Encoding (using same text model)
audio_text_embedding = text_model.encode(transcription)
print("Audio text encoding completed.")


# Step 7: Storing Embeddings in a Vector Database (ChromaDB)

In [None]:
# Step 7: Storing Embeddings in a Vector Database using CLIP
