# Semantic Chunking

In [None]:
from pytube import YouTube # Video Download
from moviepy.editor import AudioFileClip  # For audio extraction
import whisper # For transcribing audio


# Downloading the YouTube Video and Extracting Audio
def download_and_extract_audio(youtube_url, output_path="Extracted_Audio.mp3"):
    """Download the audio of a YouTube video and write it to ``output_path``.

    The first audio-only stream offered by pytube is downloaded to the
    intermediate file ``YouTube_Video.mp4`` and then re-encoded by moviepy
    into ``output_path``.

    Args:
        youtube_url: URL of the YouTube video to fetch.
        output_path: Destination of the extracted audio file.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(youtube_url)
    audio_stream = yt.streams.filter(only_audio=True).first()
    # ``first()`` yields None when the filter matches nothing; fail with a
    # clear message instead of an AttributeError on ``None.download``.
    if audio_stream is None:
        raise ValueError(f"No audio-only stream available for {youtube_url!r}")
    audio_stream.download(filename="YouTube_Video.mp4")

    audio_clip = AudioFileClip("YouTube_Video.mp4")
    try:
        audio_clip.write_audiofile(output_path)
    finally:
        # Release the underlying reader even if the export fails.
        audio_clip.close()

# Fetch the video and write its audio track to Extracted_Audio.mp3
# ("#LINK" is a placeholder for the real YouTube URL).
download_and_extract_audio("#LINK")

# Load the "base" Whisper model once; later cells reuse this object.
model = whisper.load_model("base")

# Run speech-to-text on the extracted audio file.
result = model.transcribe("Extracted_Audio.mp3")

# Keep both the full transcript text and the per-segment entries.
transcription, segments = result['text'], result['segments']


# Time Align Transcript with Audio & Semantic Chunking


def _make_chunk(chunk_id, texts, start_time, end_time):
    """Build one output chunk record from the accumulated segment texts."""
    return {
        "chunk_id": chunk_id,
        "chunk_length": end_time - start_time,
        "text": " ".join(texts),
        "start_time": start_time,
        "end_time": end_time,
    }


def create_semantic_chunks(segments, max_chunk_length=15):
    """Group consecutive transcript segments into time-bounded chunks.

    Segments are appended to the current chunk for as long as the span from
    the chunk's start to the segment's end stays within ``max_chunk_length``.
    A segment that would overflow the chunk closes it and opens a new one.
    The first segment of a chunk is always accepted, so a single segment
    longer than ``max_chunk_length`` becomes a chunk on its own.

    Args:
        segments: Iterable of mappings with ``'start'``, ``'end'`` and
            ``'text'`` keys (the shape of Whisper's ``result['segments']``).
        max_chunk_length: Maximum chunk duration, in the same unit as the
            segment timestamps (presumably seconds for Whisper output).

    Returns:
        A list of dicts with keys ``chunk_id`` (1-based), ``chunk_length``,
        ``text``, ``start_time`` and ``end_time``. Empty input gives ``[]``.
    """
    chunks = []
    texts = []
    chunk_start = None
    chunk_end = None

    for segment in segments:
        seg_start = segment['start']
        seg_end = segment['end']
        # Segment texts usually carry their own leading space; strip it so
        # joining with " " does not produce double spaces inside a chunk.
        seg_text = segment['text'].strip()

        overflows = (
            chunk_start is not None
            and (seg_end - chunk_start) > max_chunk_length
        )
        if overflows:
            chunks.append(
                _make_chunk(len(chunks) + 1, texts, chunk_start, chunk_end)
            )
            texts = []
            chunk_start = None

        if chunk_start is None:
            chunk_start = seg_start
        if seg_text:
            texts.append(seg_text)
        chunk_end = seg_end

    # Emit whatever is still pending after the last segment.
    if chunk_start is not None:
        chunks.append(_make_chunk(len(chunks) + 1, texts, chunk_start, chunk_end))

    return chunks

# Chunk the transcript segments produced by the transcription step.
chunks = create_semantic_chunks(segments)

# Copy each chunk into a fresh dict, keeping the display fields in a fixed order.
chunks_output = [
    {
        field: entry[field]
        for field in ("chunk_id", "chunk_length", "text", "start_time", "end_time")
    }
    for entry in chunks
]

# Bare expression so the notebook renders the list as the cell output.
chunks_output

# Gradio

In [None]:
import gradio as gr

def process_video(youtube_url):
    """Download, transcribe and chunk a YouTube video for the Gradio UI.

    Args:
        youtube_url: URL of the video to process.

    Returns:
        A list of dicts, one per chunk, with the keys ``chunk_id``,
        ``chunk_length``, ``text``, ``start_time`` and ``end_time``.
    """
    # Writes the audio to the default "Extracted_Audio.mp3" path.
    download_and_extract_audio(youtube_url)
    transcript = model.transcribe("Extracted_Audio.mp3")
    pieces = create_semantic_chunks(transcript['segments'])

    fields = ("chunk_id", "chunk_length", "text", "start_time", "end_time")
    return [{name: piece[name] for name in fields} for piece in pieces]

# Gradio Interface

# Single text box in (the YouTube URL), JSON viewer out (the chunk list).
iface = gr.Interface(
    fn=process_video,
    inputs="text",
    outputs="json",
)
# debug=True is passed through to Gradio's launch().
iface.launch(debug=True)