<a href="https://colab.research.google.com/github/s11khushboo/youtube-QandA/blob/main/preprocessing-video.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!pip install yt-dlp openai-whisper sentence-transformers pinecone  langchain-openai





In [20]:
# ingest.py (simplified)
from yt_dlp import YoutubeDL
import whisper
from sentence_transformers import SentenceTransformer
import uuid
import math
import time
from pinecone import Pinecone, ServerlessSpec
from urllib.parse import urlparse, parse_qs
from langchain_core.prompts import PromptTemplate  # pseudo imports
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import MemorySaver
import uuid



INDEX_NAME = "youtube-chunks"
EMBED_MODEL = "all-MiniLM-L6-v2"  # or OpenAI embeddings
WHISPER_MODEL = "small"



In [21]:
def download_audio(youtube_url, out_path="audio.mp3"):
    ydl_opts = {"format": "bestaudio/best", "outtmpl": out_path}
    # download audio
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])
        info = ydl.extract_info(youtube_url, download=False)
        title = info.get("title", None)
    return out_path ,title

In [22]:

def get_video_id(url: str):
    # Extract video ID
    parsed = urlparse(url)
    if "youtu.be" in parsed.hostname:
        video_id = parsed.path[1:]
    elif "watch" in parsed.path:
        video_id = parse_qs(parsed.query)["v"][0]
    elif parsed.path.startswith("/shorts/") or parsed.path.startswith("/embed/"):
        video_id = parsed.path.split("/")[2]
    else:
        raise ValueError("Unsupported YouTube URL format.")
    return video_id

In [23]:
# transcribe
def transcribe_whisper(audio_path):
    model = whisper.load_model(WHISPER_MODEL)
    result = model.transcribe(audio_path, task="transcribe")  # returns segments with timestamps
    return result

In [24]:

# chunking with overlap
def chunk_segments(segments, max_chars=1000, overlap_chars=200):
    chunks = []
    buffer = ""
    buffer_start = None
    buffer_end = None
    for seg in segments:
        text = seg["text"].strip()
        if not buffer:
            buffer_start = seg["start"]
        if len(buffer) + len(text) <= max_chars:
            buffer += (" " + text)
            buffer_end = seg["end"]
        else:
            chunks.append({
                "start": buffer_start, "end": buffer_end, "text": buffer.strip()
            })
            # start new buffer with overlap
            buffer = text[-overlap_chars:]
            buffer_start = seg["start"]
            buffer_end = seg["end"]
    if buffer:
        chunks.append({"start": buffer_start, "end": buffer_end, "text": buffer.strip()})
    return chunks

In [25]:
import os
from google.colab import userdata

os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")
os.environ["PINECONE_KEY"] = userdata.get("PINECONE_KEY")


In [26]:


pc = Pinecone(api_key=os.environ["PINECONE_KEY"])

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [34]:
# embeddings (sentence-transformers)
embedder = SentenceTransformer(EMBED_MODEL)

def embed_texts(texts):
    return embedder.encode(texts, show_progress_bar=False).tolist()

In [28]:
index_name = "youtube-text-demo"


# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=384,
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()



{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '183',
                                    'content-type': 'application/json',
                                    'date': 'Fri, 28 Nov 2025 13:31:09 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '86',
                                    'x-pinecone-request-id': '7092592298742738187',
                                    'x-pinecone-request-latency-ms': '86'}},
 'dimension': 384,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'__default__': {'vector_count': 88}},
 'storageFullness': 0.0,
 'total_vector_count': 88,
 'vector_type': 'dense'}

In [None]:

def upsert_chunks(video_id, title, chunks):
    texts = [c["text"] for c in chunks]
    embeddings = embed_texts(texts)  # this should be list of lists
    if hasattr(embeddings, "tolist"):
            embeddings = embeddings.tolist()
    vectors = []
    for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
        # ensure emb is a plain Python list
        if not isinstance(emb, list):
            emb = emb.tolist()

        # ensure metadata contains only serializable types
        metadata = {
            "video_id": video_id,
            "start_time": float(chunk["start"]),
            "end_time": float(chunk["end"]),
            "text": str(chunk["text"]),
            "title": str(title)
        }

        vectors.append(
            (f"{video_id}_chunk_{i}", emb, metadata)
        )

    # upsert all vectors
    index.upsert(vectors=vectors)
    print(f"Upserted {len(vectors)} chunks for video {video_id}")


In [56]:
from langchain.tools import tool

@tool
def ingest_youtube_video(url: str) -> str:
    """Ingest a YouTube video by downloading audio, transcribing, and storing chunks.

    Args:
        url: The YouTube video URL to ingest

    Returns:
        Success message with number of chunks ingested
    """
    print("Downloading audio...")
    audio_path, title = download_audio(url)
    transcript = transcribe_whisper(audio_path)
    chunks = chunk_segments(transcript["segments"])
    video_id = get_video_id(url)
    upsert_chunks(video_id, title, chunks)
    return f"Successfully ingested video: {url}. Chunks: {len(chunks)}"

In [10]:
!pip install PyMuPDF pytesseract pillow


Collecting PyMuPDF
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract, PyMuPDF
Successfully installed PyMuPDF-1.26.6 pytesseract-0.3.13


In [29]:
import requests
def download_pdf(url):
    url = url
    local_path = "neurips_paper.pdf"
    response = requests.get(url)
    with open(local_path, "wb") as f:
         f.write(response.content)
    return local_path

In [31]:
!pip install langchain-text-splitters

Collecting langchain-text-splitters
  Downloading langchain_text_splitters-1.0.0-py3-none-any.whl.metadata (2.6 kB)
Downloading langchain_text_splitters-1.0.0-py3-none-any.whl (33 kB)
Installing collected packages: langchain-text-splitters
Successfully installed langchain-text-splitters-1.0.0


In [52]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
from langchain_text_splitters import RecursiveCharacterTextSplitter

def ingest_pdf(pdf_path):
    print(pdf_path)
    doc = fitz.open(pdf_path)
    vectors = []

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50
    )


    for page_num, page in enumerate(doc):

        # ---------- TEXT CHUNKS ----------
        page_text = page.get_text("text")

        if page_text.strip():
            chunks = splitter.split_text(page_text)

            for chunk_index, chunk_text in enumerate(chunks):

                metadata = {
                    "source_type": "pdf",
                    "source_name": pdf_path,
                    "page_number": page_num + 1,
                    "chunk_id": chunk_index,
                    "content_type": "text"
                }

                vectors.append({
                    "id": f"{pdf_path}_p{page_num+1}_c{chunk_index}",
                    "values": embed_texts(chunk_text),
                    "metadata": metadata
                })


        # ---------- IMAGE OCR CHUNKS ----------
        for img_index, img in enumerate(page.get_images()):
            xref = img[0]
            img_data = doc.extract_image(xref)["image"]
            img_obj = Image.open(io.BytesIO(img_data))

            ocr_text = pytesseract.image_to_string(img_obj)

            if ocr_text.strip():
                metadata = {
                    "source_type": "pdf",
                    "source_name": pdf_path,
                    "page_number": page_num + 1,
                    "image_index": img_index,
                    "chunk_id": f"img{img_index}",
                    "content_type": "image_text"
                }

                vectors.append({
                    "id": f"{pdf_path}_p{page_num+1}_img{img_index}",
                    "values": embed_texts(ocr_text),
                    "metadata": metadata
                })

        # ---------- UPSERT TO PINECONE ----------
        if vectors:
            index.upsert(vectors=vectors)
            print(f"Inserted {len(vectors)} chunks into Pinecone from {pdf_path}.")

In [46]:
from langchain.tools import tool

def ingest_pdf_tool(url: str) -> str:
    """Ingest a pdf by storing chunks.

    Args:
        url: pdf

    Returns:
        Success message with number of chunks ingested
    """
    local_url=download_pdf(url)
    print(local_url)
    ingest_pdf(local_url)


In [53]:
print(ingest_pdf("/content/neurips_paper.pdf"))

/content/neurips_paper.pdf
Inserted 7 chunks into Pinecone from /content/neurips_paper.pdf.
Inserted 17 chunks into Pinecone from /content/neurips_paper.pdf.
Inserted 22 chunks into Pinecone from /content/neurips_paper.pdf.
Inserted 29 chunks into Pinecone from /content/neurips_paper.pdf.
Inserted 36 chunks into Pinecone from /content/neurips_paper.pdf.
Inserted 45 chunks into Pinecone from /content/neurips_paper.pdf.
Inserted 53 chunks into Pinecone from /content/neurips_paper.pdf.
Inserted 61 chunks into Pinecone from /content/neurips_paper.pdf.
Inserted 67 chunks into Pinecone from /content/neurips_paper.pdf.
Inserted 74 chunks into Pinecone from /content/neurips_paper.pdf.
Inserted 79 chunks into Pinecone from /content/neurips_paper.pdf.
None


In [54]:
from langchain.tools import tool

@tool
def search_vector_db(query: str) -> str:
    """Search the vector database for documents similar to the query.

    Args:
        query: The search query to find similar documents

    Returns:
        Search results with metadata
    """
    # 1) embed query
    q_emb = embedder.encode([query])[0]
    if hasattr(q_emb, "tolist"):
        q_emb = q_emb.tolist()

    # 2) search vector DB
    results = index.query(
        vector=q_emb,
        top_k=6,             # 6 nearest neighbors
        include_metadata=True
    )
   # 3) build context
    context = ""
    for r in results["matches"]:
        md = r["metadata"]
        context += f"[{md['start_time']:.1f}s - {md['end_time']:.1f}s] {md['text']}\n\n"

    return context

In [55]:
def create_search_agent():
    """Create and return a configured search agent."""

    llm = ChatOpenAI(model="gpt-3.5-turbo",openai_api_key=os.environ["OPENAI_API_KEY"])
    checkpointer = MemorySaver()


    agent = create_agent(
        model=llm,
        tools=[ingest_youtube_video,search_vector_db],
         system_prompt="""You are a video knowledge management assistant.
          You can:
          1. Ingest YouTube videos into the knowledge base
          2. Search for relevant information in previously ingested videos
          When a user provides a YouTube URL, ingest it. When they ask questions, search the knowledge base.""",
        checkpointer=checkpointer  # Enable memory
    )


    return agent

In [57]:
agent = create_search_agent()

In [58]:

def answer_query(user_query):
   thread_id = str(uuid.uuid4())
   result=agent.invoke({
    "messages": [{"role": "user", "content":user_query }]
   },
   config={"configurable": {"thread_id": thread_id}} )
   answer = result["messages"][-1].content
   return answer

In [59]:
question = "Multi-Head Attention"
answer = answer_query(question)
print("Final Answer:")
print(answer)

KeyError: 'start_time'

In [None]:
# question = "https://www.youtube.com/watch?v=LPZh9BOjkQs"
# result = answer_query(question)
# answer = result["messages"][-1].content
# print("Final Answer:")
# print(answer)

In [None]:
# question = "what is back prapogation"
# result = answer_query(question)
# print("Final Answer:")
# print(answer)

In [None]:
# from transformers import AutoProcessor, AutoModel
# import soundfile as sf

# model_id = "suno/bark-small"

# processor = AutoProcessor.from_pretrained(model_id)
# model = AutoModel.from_pretrained(model_id)




In [None]:
inputs = processor("This is free text to speech using Bark.", return_tensors="pt")
audio = model.generate(**inputs)

audio_np = audio.cpu().numpy().squeeze()

sf.write("output.wav", audio_np, 22050)