<a href="https://colab.research.google.com/github/s11khushboo/youtube-QandA/blob/main/preprocessing-video.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
!pip install yt-dlp openai-whisper sentence-transformers pinecone





In [57]:
!pip install langchain-openai



In [58]:
# ingest.py (simplified)
from yt_dlp import YoutubeDL
import whisper
from sentence_transformers import SentenceTransformer
import pinecone
import uuid
import math
import time
from pinecone import Pinecone, ServerlessSpec
from urllib.parse import urlparse, parse_qs
from langchain_openai import OpenAI
from langchain_core.prompts import PromptTemplate  # pseudo imports



INDEX_NAME = "youtube-chunks"
EMBED_MODEL = "all-MiniLM-L6-v2"  # or OpenAI embeddings
WHISPER_MODEL = "small"



In [59]:
def download_audio(youtube_url, out_path="audio.mp3"):
    ydl_opts = {"format": "bestaudio/best", "outtmpl": out_path}
    # download audio
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])
        info = ydl.extract_info(youtube_url, download=False)
        title = info.get("title", None)
    return out_path ,title

In [60]:

def get_video_id(url: str):
    # Extract video ID
    parsed = urlparse(url)
    if "youtu.be" in parsed.hostname:
        video_id = parsed.path[1:]
    elif "watch" in parsed.path:
        video_id = parse_qs(parsed.query)["v"][0]
    elif parsed.path.startswith("/shorts/") or parsed.path.startswith("/embed/"):
        video_id = parsed.path.split("/")[2]
    else:
        raise ValueError("Unsupported YouTube URL format.")
    return video_id

In [61]:
# transcribe
def transcribe_whisper(audio_path):
    model = whisper.load_model(WHISPER_MODEL)
    result = model.transcribe(audio_path, task="transcribe")  # returns segments with timestamps
    return result

In [62]:

# chunking with overlap
def chunk_segments(segments, max_chars=1000, overlap_chars=200):
    chunks = []
    buffer = ""
    buffer_start = None
    buffer_end = None
    for seg in segments:
        text = seg["text"].strip()
        if not buffer:
            buffer_start = seg["start"]
        if len(buffer) + len(text) <= max_chars:
            buffer += (" " + text)
            buffer_end = seg["end"]
        else:
            chunks.append({
                "start": buffer_start, "end": buffer_end, "text": buffer.strip()
            })
            # start new buffer with overlap
            buffer = text[-overlap_chars:]
            buffer_start = seg["start"]
            buffer_end = seg["end"]
    if buffer:
        chunks.append({"start": buffer_start, "end": buffer_end, "text": buffer.strip()})
    return chunks

In [63]:
import os
from google.colab import userdata

os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")
os.environ["PINECONE_KEY"] = userdata.get("PINECONE_KEY")


In [64]:


pc = Pinecone(api_key=os.environ["PINECONE_KEY"])

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [65]:
# embeddings (sentence-transformers)
embedder = SentenceTransformer(EMBED_MODEL)

def embed_texts(texts):
    return embedder.encode(texts, show_progress_bar=False).tolist()

In [66]:
index_name = "youtube-text-demo"


# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=384,
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()



{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '183',
                                    'content-type': 'application/json',
                                    'date': 'Wed, 26 Nov 2025 15:15:59 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '36',
                                    'x-pinecone-request-id': '4920664976898930262',
                                    'x-pinecone-request-latency-ms': '35'}},
 'dimension': 384,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'__default__': {'vector_count': 12}},
 'storageFullness': 0.0,
 'total_vector_count': 12,
 'vector_type': 'dense'}

In [67]:

def upsert_chunks(video_id, title, chunks):
    texts = [c["text"] for c in chunks]
    embeddings = embed_texts(texts)  # this should be list of lists
    if hasattr(embeddings, "tolist"):
            embeddings = embeddings.tolist()
    vectors = []
    for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
        # ensure emb is a plain Python list
        if not isinstance(emb, list):
            emb = emb.tolist()

        # ensure metadata contains only serializable types
        metadata = {
            "video_id": video_id,
            "start_time": float(chunk["start"]),
            "end_time": float(chunk["end"]),
            "text": str(chunk["text"]),
            "title": str(title)
        }

        vectors.append(
            (f"{video_id}_chunk_{i}", emb, metadata)
        )

    # upsert all vectors
    index.upsert(vectors=vectors)
    print(f"Upserted {len(vectors)} chunks for video {video_id}")


In [68]:
# def ingest_youtube_video(url):
#     print("Downloading audio...")
#     audio_path,title = download_audio(url)
#     transcript = transcribe_whisper(audio_path)
#     chunks=chunk_segments(transcript["segments"])
#     video_id=get_video_id(url)
#     upsert_chunks(video_id,title,chunks)
#     return f"Successfully ingested video: {url}. Chunks: {len(chunks)}"

In [69]:
from langchain.tools import tool

@tool
def ingest_youtube_video(url: str) -> str:
    """Ingest a YouTube video by downloading audio, transcribing, and storing chunks.

    Args:
        url: The YouTube video URL to ingest

    Returns:
        Success message with number of chunks ingested
    """
    print("Downloading audio...")
    audio_path, title = download_audio(url)
    transcript = transcribe_whisper(audio_path)
    chunks = chunk_segments(transcript["segments"])
    video_id = get_video_id(url)
    upsert_chunks(video_id, title, chunks)
    return f"Successfully ingested video: {url}. Chunks: {len(chunks)}"

In [71]:
!pip show langchain

Name: langchain
Version: 1.0.8
Summary: Building applications with LLMs through composability
Home-page: https://docs.langchain.com/
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.12/dist-packages
Requires: langchain-core, langgraph, pydantic
Required-by: 


In [72]:
# def search_vector_db(query: str):
#        # 1) embed query
#       q_emb = embedder.encode([query])[0]
#       if hasattr(q_emb, "tolist"):
#           q_emb = q_emb.tolist()
#       # 2) search vector DB
#       results = index.query(
#         vector=q_emb,
#         top_k=6,             # number of nearest neighbors
#         include_metadata=True
#       )
#       return results

In [73]:
from langchain.tools import tool

@tool
def search_vector_db(query: str) -> str:
    """Search the vector database for documents similar to the query.

    Args:
        query: The search query to find similar documents

    Returns:
        Search results with metadata
    """
    # 1) embed query
    q_emb = embedder.encode([query])[0]
    if hasattr(q_emb, "tolist"):
        q_emb = q_emb.tolist()

    # 2) search vector DB
    results = index.query(
        vector=q_emb,
        top_k=6,             # 6 nearest neighbors
        include_metadata=True
    )
   # 3) build context
    context = ""
    for r in results["matches"]:
        md = r["metadata"]
        context += f"[{md['start_time']:.1f}s - {md['end_time']:.1f}s] {md['text']}\n\n"

    return context

In [74]:
# def create_search_agent(user_query, conversation_id=None):
#     results=search_vector_db(user_query)



#     # 4) system + user prompt
#     system_prompt = "You are an assistant that answers queries using ONLY the provided video excerpts"
#     # query = f"{system_prompt}\n\nContext:\n{context}\n\nQuestion: {user_query}\nAnswer"
#     # 5) call LLM (could be OpenAI or local)

#     llm = OpenAI(model="gpt-3.5-turbo-instruct",temperature = 0.0,openai_api_key=os.environ["OPENAI_API_KEY"])


#     agent = create_agent(
#     model=llm,
#     tools=[search_vector_db],
#     system_prompt="You are a document search assistant. Use the vector database to find relevant documents."
#     )
#     return agent



In [80]:
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import MemorySaver
import uuid

def answer_query(user_query):
    """Create and return a configured search agent."""

    llm = ChatOpenAI(model="gpt-3.5-turbo",openai_api_key=os.environ["OPENAI_API_KEY"])
    checkpointer = MemorySaver()


    agent = create_agent(
        model=llm,
        tools=[ingest_youtube_video,search_vector_db],
         system_prompt="""You are a video knowledge management assistant.
          You can:
          1. Ingest YouTube videos into the knowledge base
          2. Search for relevant information in previously ingested videos
          When a user provides a YouTube URL, ingest it. When they ask questions, search the knowledge base.""",
        checkpointer=checkpointer  # Enable memory
    )
    thread_id = str(uuid.uuid4())
    result=agent.invoke({
    "messages": [{"role": "user", "content":user_query }]
   },
   config={"configurable": {"thread_id": thread_id}} )
    return result

In [81]:
question = "Explain MCP in simple terms."
result = answer_query(question)
answer = result["messages"][-1].content
print("Final Answer:")
print(answer)

Final Answer:
MCP stands for Model Context Protocol. It is a protocol that enables AI agents to dynamically and autonomously interact with software systems. Unlike traditional APIs, which require manual integration by human developers, MCP allows AI agents to access a server that provides a machine-readable menu of its capabilities. This means the AI agent can understand and use the available tools without the need for manual coding. MCP complements APIs by serving as a smart universal remote control for AI agents to operate software systems effectively. While APIs are like a restaurant menu where human developers place specific orders, MCP allows AI agents to make intelligent decisions based on the available tools without explicit coding for every action.


In [82]:
# question = "https://www.youtube.com/watch?v=LPZh9BOjkQs"
# result = answer_query(question)
# answer = result["messages"][-1].content
# print("Final Answer:")
# print(answer)

Downloading audio...
[youtube] Extracting URL: https://www.youtube.com/watch?v=LPZh9BOjkQs
[youtube] LPZh9BOjkQs: Downloading webpage




[youtube] LPZh9BOjkQs: Downloading android sdkless player API JSON
[youtube] LPZh9BOjkQs: Downloading web safari player API JSON




[youtube] LPZh9BOjkQs: Downloading m3u8 information




[info] LPZh9BOjkQs: Downloading 1 format(s): 251-11
[download] audio.mp3 has already been downloaded
[download] 100% of    7.79MiB
[youtube] Extracting URL: https://www.youtube.com/watch?v=LPZh9BOjkQs
[youtube] LPZh9BOjkQs: Downloading webpage




[youtube] LPZh9BOjkQs: Downloading android sdkless player API JSON
[youtube] LPZh9BOjkQs: Downloading web safari player API JSON




[youtube] LPZh9BOjkQs: Downloading m3u8 information




Upserted 8 chunks for video LPZh9BOjkQs
Final Answer:
The YouTube video has been successfully ingested, and it has been divided into 8 chunks for analysis. What would you like to know about this video?


In [83]:
question = "what is back prapogation"
result = answer_query(question)
answer = result["messages"][-1].content
print("Final Answer:")
print(answer)

Final Answer:
Backpropagation is an algorithm used to tweak all the parameters in a language model during training. The parameters in a language model are continuously refined based on many example pieces of text. Backpropagation is used to adjust these parameters based on the predictions made by the model compared to the true values in the training data. This process helps the model learn and improve its accuracy in making predictions.
