In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter

# Extract the video ID from a YouTube URL
def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Supported forms:
      * ``youtube.com/watch?v=<id>`` (any subdomain, any query-parameter order)
      * ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``
      * ``youtu.be/<id>``

    Query parameters other than ``v`` and URL fragments are ignored, so links
    such as ``https://youtu.be/<id>?t=42`` resolve to the bare ID.

    Args:
        url: The YouTube URL as a string; the scheme may be omitted.

    Returns:
        The video ID string.

    Raises:
        ValueError: If ``url`` is not a string, is not a YouTube URL, or
            carries no video ID.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import parse_qs, urlparse

    if not isinstance(url, str):
        raise ValueError("Invalid YouTube URL")

    candidate = url.strip()
    # urlparse only recognises the host when a scheme is present.
    if "://" not in candidate:
        candidate = "https://" + candidate

    parsed = urlparse(candidate)
    host = (parsed.hostname or "").lower()
    path_parts = [part for part in parsed.path.split("/") if part]

    video_id = ""
    if host == "youtu.be":
        # Short links carry the ID as the first path segment.
        if path_parts:
            video_id = path_parts[0]
    elif host == "youtube.com" or host.endswith(".youtube.com"):
        query_ids = parse_qs(parsed.query).get("v")
        if query_ids:
            video_id = query_ids[0]
        elif len(path_parts) >= 2 and path_parts[0] in ("embed", "shorts", "live", "v"):
            video_id = path_parts[1]

    if not video_id:
        raise ValueError("Invalid YouTube URL")
    return video_id
    
def load_youtube_transcript(url):
    """Fetch the transcript of the YouTube video behind ``url``.

    Args:
        url: A YouTube URL understood by ``extract_video_id``.

    Returns:
        Whatever ``YouTubeTranscriptApi.get_transcript`` returns for the video.

    Raises:
        ValueError: If no video ID can be extracted from ``url``.
        RuntimeError: If the transcript cannot be fetched; the underlying
            exception is chained as ``__cause__``.
    """
    # Invalid URLs surface as ValueError straight from extract_video_id.
    video_id = extract_video_id(url)
    try:
        return YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        # Raise instead of returning the exception object: a returned exception
        # would be treated as a transcript by callers and fail later with a
        # misleading error.
        raise RuntimeError(
            f"Could not fetch transcript for video {video_id!r}"
        ) from e

In [24]:
import os
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec, pinecone
from dotenv import load_dotenv

# Load variables from the .env file, then read the OpenAI key from the environment
load_dotenv()
open_api_key = os.environ.get("OPENAI_API_KEY")

# Module-level OpenAI client, used by get_embedding
client = OpenAI(api_key=open_api_key)

# Time to split the transcript into chunks and store in a vector store

def collect_transcript(transcript):
    """Flatten transcript entries into a single string.

    Each entry's ``"text"`` value is followed by one space, so a non-empty
    result ends with a trailing space and an empty transcript yields ``""``.

    Args:
        transcript: Iterable of mappings that each have a ``"text"`` key.

    Returns:
        The concatenated transcript text.
    """
    return "".join(entry["text"] + " " for entry in transcript)

def split_text(text, max_length=500, chunk_overlap=200):
    """Split ``text`` into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The text to split.
        max_length: Maximum size of each chunk (passed as ``chunk_size``).
        chunk_overlap: Overlap between consecutive chunks, to maintain context.
            Defaults to 200, the value that was previously hard-coded.
            NOTE(review): the splitter is expected to reject an overlap larger
            than the chunk size, so pass a smaller value when lowering
            ``max_length`` below 200 — confirm against the installed version.

    Returns:
        The list of chunks produced by the splitter's ``split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_length,
        chunk_overlap=chunk_overlap,
    )

    return text_splitter.split_text(text)

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [36]:

# Vector dimension the index is created with, and the name of the index
dimensions = 1536
index_name = "youtube-transcript-index"

# Read the Pinecone API key from the environment
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Connect to Pinecone
pc = Pinecone(api_key=pinecone_api_key)

# Create the serverless index only when it is not already listed
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimensions,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Index handle used by upsert_vector and query_embedding
index = pc.Index(index_name)

In [27]:
# Upsert the embedded vector
def upsert_vector(text, embedding):
    """Upsert one vector into the module-level Pinecone ``index``.

    The chunk text itself is used as the vector ID.

    Args:
        text: The chunk text; becomes the record's ID.
        embedding: The embedding values stored for that ID.
    """
    record = (text, embedding)
    index.upsert([record])

def query_embedding(query_text, top_k=5):
    """Embed ``query_text`` and return the closest matches from the index.

    Args:
        query_text: Natural-language query to search for.
        top_k: Number of matches to request. Defaults to 5, the value that
            was previously hard-coded.

    Returns:
        The response object returned by ``index.query``.
    """
    # Named query_vector so the local no longer shadows this function's name.
    query_vector = get_embedding(query_text)
    # The query vector is passed as a flat list of floats, not wrapped in
    # another list.
    return index.query(vector=query_vector, top_k=top_k)

In [38]:
# Load a Youtube Transcript and store in Pinecone
def load_youtube_transcript_and_store(url):
    """Fetch a video's transcript, chunk it, embed each chunk and upsert it.

    Args:
        url: YouTube URL of the video whose transcript should be indexed.

    Raises:
        Exception: Whatever error the transcript loader reported, re-raised
            here rather than being processed as transcript data.
    """
    transcript = load_youtube_transcript(url)
    # Guard against a loader that returns the exception it caught instead of
    # raising it; iterating that object would fail with an unrelated TypeError.
    if isinstance(transcript, BaseException):
        raise transcript

    chunks = split_text(collect_transcript(transcript))

    # One embedding request and one upsert per chunk.
    for chunk in chunks:
        upsert_vector(chunk, get_embedding(chunk))



# TODO: combine all the documents matched by a query into a single document (not implemented yet)

# Ingest one video's transcript into the index (calls the transcript, embedding
# and upsert helpers), then run a sample query against it.
load_youtube_transcript_and_store("https://www.youtube.com/watch?v=aZjYr87r1b8")
# query_embedding embeds the question and queries the index for matches.
query_result = query_embedding("What are B Trees useful for?")
print(query_result)

{'matches': [{'id': 'hi the topic is Beatriz and B plus trees this is one of '
                    'the important topic and is a difficult topic for the '
                    'students to understand this is useful in databases what '
                    'are the things that are required for understanding B '
                    'trees and B plus trees thoroughly this is the list of '
                    "topics if you understand all these things hence you'll "
                    'know all these things then you can understand what B '
                    'trees and B plus trees are mostly type people try to '
                    'understand how insertion and deletion is done just they '
                    'try to',
              'score': 0.584004879,
              'values': []},
             {'id': 'cover all these topics that are related to B and B plus '
                    'trees and help you understand perfectly what B plus a and '
                    'B and B plus trees are let m