In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter

# Get the video ID of youtube video from url
def extract_video_id(url):
    """Return the video ID embedded in a YouTube URL.

    Supported URL shapes:
      * ``youtube.com/watch?v=<id>`` (any subdomain, any query-parameter order)
      * ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``
      * ``youtu.be/<id>`` (with or without a trailing query string)

    Args:
        url: The video URL as a string; the scheme is optional.

    Returns:
        The video ID string.

    Raises:
        ValueError: If the URL is not a YouTube URL or carries no video ID.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import parse_qs, urlparse

    candidate = url.strip()
    # urlparse only recognises the host when a scheme is present.
    if "://" not in candidate:
        candidate = "https://" + candidate

    parsed = urlparse(candidate)
    host = (parsed.hostname or "").lower()
    segments = [segment for segment in parsed.path.split("/") if segment]

    video_id = ""
    if host == "youtu.be":
        # Short links carry the ID as the first path segment; any query
        # string (e.g. ?t=30) is already separated out by urlparse.
        if segments:
            video_id = segments[0]
    elif host == "youtube.com" or host.endswith(".youtube.com"):
        query_values = parse_qs(parsed.query).get("v")
        if query_values:
            video_id = query_values[0]
        elif len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            video_id = segments[1]

    if not video_id:
        raise ValueError("Invalid YouTube URL")
    return video_id
    
def load_youtube_transcript(url):
    """Fetch the transcript for the YouTube video at ``url``.

    Args:
        url: A YouTube video URL that ``extract_video_id`` can parse.

    Returns:
        Whatever ``YouTubeTranscriptApi.get_transcript`` returns for the video
        (presumably a list of dicts with a "text" key, since that is how
        ``collect_transcript`` reads it -- verify against the installed
        youtube-transcript-api version).

    Raises:
        ValueError: If no video ID can be extracted from ``url``.
        RuntimeError: If the transcript cannot be retrieved; the original
            error is chained as ``__cause__``.
    """
    # Kept outside the try block so a malformed URL surfaces as ValueError.
    video_id = extract_video_id(url)
    try:
        return YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        # The exception used to be *returned*, so callers only failed later
        # with an unrelated TypeError when they tried to iterate it.
        raise RuntimeError(
            f"Could not load transcript for video {video_id!r}: {e}"
        ) from e

In [60]:
import os
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec, pinecone
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment, then
# read the OpenAI key from it. os.getenv returns None when the variable is
# unset, so a missing key only shows up once the client is used.
load_dotenv()
open_api_key = os.getenv("OPENAI_API_KEY")


# Shared OpenAI client, used below for embeddings and chat completions.
client = OpenAI(api_key=open_api_key)

# Time to split the transcript into chunks and store in a vector store
def collect_transcript(transcript):
    """Flatten transcript entries into a single string.

    Args:
        transcript: Iterable of mappings, each providing a "text" key.

    Returns:
        Each entry's text followed by one space, concatenated in order, so a
        non-empty result ends with a trailing space. Returns "" when there
        are no entries.
    """
    return "".join(entry["text"] + " " for entry in transcript)

def split_text(text, max_length=500, chunk_overlap=200):
    """Split ``text`` into overlapping chunks for embedding.

    Args:
        text: The full text to split.
        max_length: Maximum size of each chunk, passed as ``chunk_size``.
        chunk_overlap: Amount of text shared between consecutive chunks to
            preserve context. It was previously hard-coded to 200, which made
            small ``max_length`` values unusable; it should be smaller than
            ``max_length`` (the splitter is expected to reject a larger
            overlap -- see the LangChain docs).

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_length,
        chunk_overlap=chunk_overlap,
    )

    return text_splitter.split_text(text)

def get_embedding(text, model="text-embedding-3-small"):
    """Embed ``text`` with the OpenAI embeddings endpoint.

    Args:
        text: The text to embed; newlines are replaced with spaces first.
        model: Name of the embedding model to request.

    Returns:
        The embedding of the single input, taken from the first item of the
        response's ``data`` list.
    """
    single_line = text.replace("\n", " ")
    embedding_response = client.embeddings.create(input=[single_line], model=model)
    first_item = embedding_response.data[0]
    return first_item.embedding

In [36]:

# Vector size of the index; 1536 is the default output size of the
# text-embedding-3-small model that get_embedding requests.
dimensions = 1536
index_name = "youtube-transcript-index"

# Pinecone credentials come from the environment (None when unset).
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Client handle for the vector store.
pc = Pinecone(api_key=pinecone_api_key)

# Create the serverless index only on first run; later runs reuse it.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimensions,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle used by the upsert and query helpers in later cells.
index = pc.Index(index_name)

In [27]:
# Upsert the embedded vector
def upsert_vector(text, embedding):
    """Store one embedding in the Pinecone index, keyed by its source text.

    The chunk text itself is used as the record ID; ``combine_documents``
    relies on this to recover the text from query matches.

    Args:
        text: The chunk text, used as the record ID.
        embedding: The embedding vector for ``text``.
    """
    # NOTE(review): Pinecone restricts record IDs (length / character set);
    # confirm that transcript chunks always fit within those limits.
    record = (text, embedding)  # (id, values)
    index.upsert([record])

def query_embedding(query_text, top_k=5):
    """Find the stored chunks most similar to ``query_text``.

    Args:
        query_text: Natural-language query to embed and search with.
        top_k: Number of nearest matches to request (defaults to 5, the
            previously hard-coded value).

    Returns:
        The Pinecone query response; its "matches" entries are what
        ``combine_documents`` reads.
    """
    # Named differently from the function so the local no longer shadows it.
    query_vector = get_embedding(query_text)
    # `vector` takes a flat list of floats; the embedding used to be wrapped
    # in an extra list, producing a nested [[...]] payload.
    return index.query(vector=query_vector, top_k=top_k)

In [40]:
# Load a Youtube Transcript and store in Pinecone
def load_youtube_transcript_and_store(url):
    """Download a video's transcript, chunk it, embed it and store it.

    Each chunk is embedded with ``get_embedding`` and written to the vector
    index with ``upsert_vector``, one chunk at a time.

    Args:
        url: URL of the YouTube video whose transcript should be indexed.

    Raises:
        Exception: Whatever error prevented the transcript from loading.
    """
    transcript = load_youtube_transcript(url)
    # Guard against a loader that hands back the exception it caught instead
    # of raising it; without this the failure would surface further down as
    # an unrelated "object is not iterable" TypeError.
    if isinstance(transcript, Exception):
        raise transcript

    chunks = split_text(collect_transcript(transcript))

    for chunk in chunks:
        embedding = get_embedding(chunk)
        upsert_vector(chunk, embedding)

# Combine all the documents matched with a query into a single document
def combine_documents(documents):
    """Join the IDs of all query matches into one newline-terminated string.

    Args:
        documents: A query result exposing a "matches" sequence whose items
            each expose an "id".

    Returns:
        Every match ID followed by a newline, concatenated in match order;
        "" when there are no matches.
    """
    return "".join(hit["id"] + "\n" for hit in documents["matches"])

In [68]:
def get_response_from_query(query_text):
    """Answer a question using the transcript chunks stored in the index.

    Args:
        query_text: The user's question.

    Returns:
        A ``(answer, documents)`` tuple: the chat model's reply text and the
        combined context string that was placed in the system prompt.
    """
    # Retrieve the closest stored chunks and flatten them into one context string.
    documents = combine_documents(query_embedding(query_text))

    # Prompt text is assembled by hand; the retrieved context goes into the
    # system message and the question into the user message.
    system_message = f"""You are a helpful AI teacher that specializes in whatever subject I give you readings of. 
    I will provide you with YouTube video transcripts and you will answer questions based on the content. 
    Here is the relevant content for the question:\n\n {documents} \n\n Only use information given from the transcript, if you do not have enough context to answer the question, say 'I do not have enough information to answer the question.'"""
    question_message = f"Answer the following question: {query_text}"

    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": question_message},
    ]

    # Ask the chat model for the answer.
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=chat_messages,
        temperature=0.2,
    )

    return completion.choices[0].message.content, documents

    

In [70]:
# NOTE(review): video_url is not used in this cell; presumably
# load_youtube_transcript_and_store(video_url) was run earlier to populate
# the index -- confirm, otherwise a fresh run queries an unpopulated index.
video_url = "https://www.youtube.com/watch?v=aZjYr87r1b8"
# Example question to type at the prompt: How do I implement my own B-trees?
# input() blocks until the question is entered interactively.
query = input()
# `documents` holds the retrieved context string, kept for inspection.
response, documents = get_response_from_query(query)
print(response)

To implement your own B-trees, you need to follow these steps based on the provided transcript:

1. **Understand the Node Structure**: Each node in a B-tree of degree 4 (as used in the example) can have up to 4 children and 3 keys. Each key in the node has a pointer to the corresponding database record.

2. **Insertion Process**: Start by inserting keys into the tree. Initially, there is no node, so the first key you insert becomes the root node.

3. **Splitting Nodes**: When a node exceeds its capacity (more than 3 keys in this example), you need to split the node. The middle key moves up to the parent node, and the remaining keys are split into two nodes. This process continues as you insert more keys, causing the tree to grow upwards.

4. **Guidelines for Creation**: Unlike a simple multi-way search tree, B-trees have specific rules for creation to ensure efficient searching. These rules help maintain balance in the tree, preventing it from becoming inefficient like a linear search.