<a href="https://colab.research.google.com/github/s11khushboo/youtube-QandA/blob/main/preprocessing-video.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install yt-dlp openai-whisper sentence-transformers pinecone



Collecting pinecone
  Downloading pinecone-8.0.0-py3-none-any.whl.metadata (11 kB)
Collecting pinecone-plugin-assistant<4.0.0,>=3.0.1 (from pinecone)
  Downloading pinecone_plugin_assistant-3.0.1-py3-none-any.whl.metadata (30 kB)
Collecting packaging>=20.9 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pinecone-8.0.0-py3-none-any.whl (745 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m745.9/745.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_assistant-3.0.1-py3-none-any.whl (280 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.9/280.9 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-24.2-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: packaging, pinecone-plugin-assistant, pine

In [4]:
# ingest.py (simplified)
from yt_dlp import YoutubeDL
import whisper
from sentence_transformers import SentenceTransformer
import pinecone
import uuid
import math
import time

INDEX_NAME = "youtube-chunks"
EMBED_MODEL = "all-MiniLM-L6-v2"  # or OpenAI embeddings
WHISPER_MODEL = "small"
# download audio
def download_audio(youtube_url, out_path="audio.mp3"):
    ydl_opts = {"format": "bestaudio/best", "outtmpl": out_path}
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])
    return out_path

In [2]:
# transcribe
def transcribe_whisper(audio_path):
    model = whisper.load_model(WHISPER_MODEL)
    result = model.transcribe(audio_path, task="transcribe")  # returns segments with timestamps
    return result  # {"text": "...", "segments": [{start,end,text}], ...}

In [9]:
def ingest_youtube_video(url):
    print("Downloading audio...")
    audio_path = download_audio(url)

    return audio_path
print(ingest_youtube_video("https://youtu.be/dwlE7TiDXz4?si=SvVKbWlBuInfYECa"))

Downloading audio...
[youtube] Extracting URL: https://youtu.be/dwlE7TiDXz4?si=SvVKbWlBuInfYECa
[youtube] dwlE7TiDXz4: Downloading webpage




[youtube] dwlE7TiDXz4: Downloading android sdkless player API JSON
[youtube] dwlE7TiDXz4: Downloading web safari player API JSON




[youtube] dwlE7TiDXz4: Downloading m3u8 information




[info] dwlE7TiDXz4: Downloading 1 format(s): 251-12
[download] audio.mp3 has already been downloaded
[download] 100% of    7.44MiB
audio.mp3


In [10]:
print("Transcribing audio...")
transcript = transcribe_whisper("/content/audio.mp3")
print(transcript)

Transcribing audio...


100%|███████████████████████████████████████| 461M/461M [00:06<00:00, 77.8MiB/s]


{'text': " MCP vs API. Will MCP replace API? MCP flips this completely. AI, models and AI agents. APIs are like a restaurant menu. No pre-written code, no manual integration. With APIs, discovery is static. If you asked an AI agent to order you a pizza, book a doctor's appointment, and send an email to your boss, all in one go, could it actually do that? The answer is yes, but not with traditional APIs. It's with MCP, Model Context Protocol. If you're new here, I am Priyanka and on this channel, we break down cloud and AI technologies for developers and tech practitioners who want to stay ahead of the curve. Now today, we're diving into MCP vs API, a debate that is fundamentally changing how you think about AI, agent and development. Let's start with one sentence difference between MCP and APIs. Now APIs are built for human developers to manually integrate software systems. While MCP is specifically designed for AI, models and AI agents to dynamically and autonomously interact with sof

In [11]:

# chunking with overlap
def chunk_segments(segments, max_chars=1000, overlap_chars=200):
    chunks = []
    buffer = ""
    buffer_start = None
    buffer_end = None
    for seg in segments:
        text = seg["text"].strip()
        if not buffer:
            buffer_start = seg["start"]
        if len(buffer) + len(text) <= max_chars:
            buffer += (" " + text)
            buffer_end = seg["end"]
        else:
            chunks.append({
                "start": buffer_start, "end": buffer_end, "text": buffer.strip()
            })
            # start new buffer with overlap
            buffer = text[-overlap_chars:]
            buffer_start = seg["start"]
            buffer_end = seg["end"]
    if buffer:
        chunks.append({"start": buffer_start, "end": buffer_end, "text": buffer.strip()})
    return chunks

In [12]:
chunks=chunk_segments(transcript["segments"])
print(chunks)

[{'start': 0.0, 'end': 82.4, 'text': "MCP vs API. Will MCP replace API? MCP flips this completely. AI, models and AI agents. APIs are like a restaurant menu. No pre-written code, no manual integration. With APIs, discovery is static. If you asked an AI agent to order you a pizza, book a doctor's appointment, and send an email to your boss, all in one go, could it actually do that? The answer is yes, but not with traditional APIs. It's with MCP, Model Context Protocol. If you're new here, I am Priyanka and on this channel, we break down cloud and AI technologies for developers and tech practitioners who want to stay ahead of the curve. Now today, we're diving into MCP vs API, a debate that is fundamentally changing how you think about AI, agent and development. Let's start with one sentence difference between MCP and APIs. Now APIs are built for human developers to manually integrate software systems. While MCP is specifically designed for AI, models and AI agents to dynamically and aut

In [22]:

import getpass
import os

# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
os.environ["PINECONE_API_KEY"] = getpass.getpass()

··········


In [26]:
from pinecone import Pinecone, ServerlessSpec

print(PINECONE_API_KEY)

pc = Pinecone(api_key="pcsk_6BUe3k_5E2SpCom69vAzzjDpMifRw7dzkkSeXM3Fw2jKeu9J46bgJ1Vrt3fbaXEDjmp4uY")

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

...


In [27]:


# # Config
# PINECONE_API_KEY = "..."
# PINECONE_ENV = "..."



# embeddings (sentence-transformers)
embedder = SentenceTransformer(EMBED_MODEL)

def embed_texts(texts):
    return embedder.encode(texts, show_progress_bar=False).tolist()

# pinecone init & upsert
# pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
index_name = "youtube-text-demo"
import time

# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=384,
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()



{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '150',
                                    'content-type': 'application/json',
                                    'date': 'Tue, 25 Nov 2025 13:23:19 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '57',
                                    'x-pinecone-request-id': '337189916962666050',
                                    'x-pinecone-request-latency-ms': '56'}},
 'dimension': 384,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'storageFullness': 0.0,
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [29]:

def upsert_chunks(video_id, title, chunks):
    texts = [c["text"] for c in chunks]
    embeddings = embed_texts(texts)
    items = []
    for i,(chunk, emb) in enumerate(zip(chunks, embeddings)):
        item = {
            "id": f"{video_id}_chunk_{i}",
            "metadata": {
                "video_id": video_id,
                "start_time": chunk["start"],
                "end_time": chunk["end"],
                "text": chunk["text"],
                "title": title
            },
            "values": emb
        }
        items.append(item)
    # upsert in batches
    index.upsert(vectors=[(it["id"], it["values"], it["metadata"]) for it in items])

In [30]:
    print("Uploading to Pinecone...")
    upsert_chunks("dwlE7TiDXz40","MCP vs API: What Every Developer Needs to Know",chunks)

Uploading to Pinecone...
