In [1]:
from dotenv import load_dotenv
import os

load_dotenv(dotenv_path="./.env", override=True) 

api_key = os.getenv("OPENAI_API_KEY")


## Install libraries

In [None]:
!pip install -q youtube-transcript-api langchain-community langchain-openai \
               faiss-cpu tiktoken python-dotenv

In [2]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled

## Step 1a - Indexing (Document Ingestion)

In [None]:
video_id = "FWAdfuPpLOc" # mr beast's latest video id ... lol
preferred_languages = ["en"]

try:
    transcripts = YouTubeTranscriptApi.list_transcripts(video_id)

    try:
        transcript_obj = transcripts.find_transcript(preferred_languages)
        print("✅ Using transcript in preferred language:", transcript_obj.language_code)
        transcript = transcript_obj.fetch()

    except NoTranscriptFound:
        print("⚠️ Preferred language not found. Trying fallback...")

        if transcripts._generated_transcripts:
            fallback_obj = list(transcripts._generated_transcripts.values())[0]
            print("✅ Using fallback transcript:", fallback_obj.language_code)
            transcript = fallback_obj.fetch()
        else:
            print("❌ No generated transcripts available.")
            transcript = None

    if transcript:
        if hasattr(transcript, "snippets"):
            # ✅ THIS IS WHAT YOU HAVE
            text = " ".join(snippet.text for snippet in transcript.snippets)
        elif isinstance(transcript, list):
            text = " ".join(chunk["text"] for chunk in transcript)
        elif hasattr(transcript, "to_list"):
            text = " ".join(chunk["text"] for chunk in transcript.to_list())
        else:
            raise TypeError("Unsupported transcript format")

        print("\n📄 Final Transcript Preview:\n", text[:1000])  # show a snippet
    else:
        print("❌ No transcript found.")

except TranscriptsDisabled:
    print("❌ Transcripts are disabled for this video.")
except Exception as e:
    print(f"❌ Unexpected error: {e}")


✅ Using transcript in preferred language: en

📄 Final Transcript Preview:
 Who's faster? A cheetah or one of
the fastest cars in the world? Can a man beat a bear in
a fight? Can a tiger beat the world-record
high jumper? We're gonna answer all of these
and more— starting with the strongest man
in the world. And he's about to test his strength
against a 500-pound lion in a game of
tug-of-war. Any last words? It's go time. 3, 2, 1… Go! Alright and
we’re off. Oh my gosh!
Look at his mouth! Look at that
brute strength. Oh my gosh. The lion’s literally holding
the strongest man in the world back right now! Wait, what is he doing? Did that lion just
turn around? It looks like he's trying to
wear Brian out. Here's Brian pulling
a literal plane. But right now, he cannot get
this lion to budge. He’s starting
to struggle! Brian's actually
doing this! Oh, ok. Ok. You got like
3 feet left. You're almost
there! Brian, you’re almost
there! Brian won! Brian won, congrats! Yes! Brian just won! Did you

In [4]:
text

'Who\'s faster? A cheetah or one of\nthe fastest cars in the world? Can a man beat a bear in\na fight? Can a tiger beat the world-record\nhigh jumper? We\'re gonna answer all of these\nand more— starting with the strongest man\nin the world. And he\'s about to test his strength\nagainst a 500-pound lion in a game of\ntug-of-war. Any last words? It\'s go time. 3, 2, 1… Go! Alright and\nwe’re off. Oh my gosh!\nLook at his mouth! Look at that\nbrute strength. Oh my gosh. The lion’s literally holding\nthe strongest man in the world back right now! Wait, what is he doing? Did that lion just\nturn around? It looks like he\'s trying to\nwear Brian out. Here\'s Brian pulling\na literal plane. But right now, he cannot get\nthis lion to budge. He’s starting\nto struggle! Brian\'s actually\ndoing this! Oh, ok. Ok. You got like\n3 feet left. You\'re almost\nthere! Brian, you’re almost\nthere! Brian won! Brian won, congrats! Yes! Brian just won! Did you have fun? Oh, ok.\nI just— I won\'t talk to h

## Step 1b - Indexing (Text Splitting)

In [5]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([text])

In [6]:
len(chunks)

26

In [7]:
chunks

[Document(metadata={}, page_content="Who's faster? A cheetah or one of\nthe fastest cars in the world? Can a man beat a bear in\na fight? Can a tiger beat the world-record\nhigh jumper? We're gonna answer all of these\nand more— starting with the strongest man\nin the world. And he's about to test his strength\nagainst a 500-pound lion in a game of\ntug-of-war. Any last words? It's go time. 3, 2, 1… Go! Alright and\nwe’re off. Oh my gosh!\nLook at his mouth! Look at that\nbrute strength. Oh my gosh. The lion’s literally holding\nthe strongest man in the world back right now! Wait, what is he doing? Did that lion just\nturn around? It looks like he's trying to\nwear Brian out. Here's Brian pulling\na literal plane. But right now, he cannot get\nthis lion to budge. He’s starting\nto struggle! Brian's actually\ndoing this! Oh, ok. Ok. You got like\n3 feet left. You're almost\nthere! Brian, you’re almost\nthere! Brian won! Brian won, congrats! Yes! Brian just won! Did you have fun? Oh, ok.

## Step 1c & 1d - Indexing (Embedding Generation and Storing in Vector Store)

In [8]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = FAISS.from_documents(chunks, embeddings)

In [9]:
vector_store.index_to_docstore_id

{0: 'c330ec32-ad94-43a6-845e-e660bd01d574',
 1: '27e84a17-f0e2-467a-9be6-4902f7295861',
 2: 'bc0bb126-bdf9-409f-92cd-ac839b56ce59',
 3: 'c5794fba-4379-4357-9106-e76c5eb2fb81',
 4: 'f4e64f30-3252-4945-888d-2bc31a555dd6',
 5: '420b45d1-738a-48aa-a0fc-07e41eaa768c',
 6: 'fb61abf5-e297-43db-b312-c82472d0435f',
 7: 'cfad0e27-53c1-48d4-86f0-60eb17703bf4',
 8: '59596320-31e4-4b86-b83e-c627a54bb9d7',
 9: '87d712e2-fc26-43a1-8337-4595a1dd028c',
 10: '7341134c-b6ee-4bee-800d-0bf7cf2aff26',
 11: '9b586bdf-7d3f-41dc-b4e8-ec51af402ac1',
 12: '8847cfb6-56ab-4299-938c-c26166cf740c',
 13: '87e17349-b5b0-4451-b577-69cfc614ddab',
 14: 'd60ca191-3f4b-4a66-a148-fea62f0fe9ff',
 15: '479fb6ff-e04a-47c6-a699-c11831d73d1c',
 16: 'a24cb610-0dfb-4d8a-a3a8-4c9a1a7c403b',
 17: '5123d966-e7a8-4d39-baab-71ea53a7b1b2',
 18: 'b8a9b30b-e264-426b-9e1d-27594ad1ffd5',
 19: '3df1c2a7-e830-4194-89e3-9b959d46dc21',
 20: 'c376ec7d-95f1-4cad-85db-c1e6ca912083',
 21: '85cb71e9-ac19-4bc1-944f-f7248bd43a22',
 22: 'ad42b7b0-c9b9-

In [10]:
vector_store.get_by_ids(['f3a56725-441c-450a-b148-85d0d72199be'])

[]

## Step 2 - Retrieval

In [11]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [12]:
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001739EE18A90>, search_kwargs={'k': 4})

In [13]:
retriever.invoke('whos is brian shaw')

[Document(id='c330ec32-ad94-43a6-845e-e660bd01d574', metadata={}, page_content="Who's faster? A cheetah or one of\nthe fastest cars in the world? Can a man beat a bear in\na fight? Can a tiger beat the world-record\nhigh jumper? We're gonna answer all of these\nand more— starting with the strongest man\nin the world. And he's about to test his strength\nagainst a 500-pound lion in a game of\ntug-of-war. Any last words? It's go time. 3, 2, 1… Go! Alright and\nwe’re off. Oh my gosh!\nLook at his mouth! Look at that\nbrute strength. Oh my gosh. The lion’s literally holding\nthe strongest man in the world back right now! Wait, what is he doing? Did that lion just\nturn around? It looks like he's trying to\nwear Brian out. Here's Brian pulling\na literal plane. But right now, he cannot get\nthis lion to budge. He’s starting\nto struggle! Brian's actually\ndoing this! Oh, ok. Ok. You got like\n3 feet left. You're almost\nthere! Brian, you’re almost\nthere! Brian won! Brian won, congrats! Yes

## Step 3 - Augmentation

In [14]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)

In [15]:
prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [16]:
question          = "According to the video who jumps higher a tiger or a man?"
retrieved_docs    = retriever.invoke(question)

In [17]:
retrieved_docs

[Document(id='27e84a17-f0e2-467a-9be6-4902f7295861', metadata={}, page_content="3 feet left. You're almost\nthere! Brian, you’re almost\nthere! Brian won! Brian won, congrats! Yes! Brian just won! Did you have fun? Oh, ok.\nI just— I won't talk to him.\nThat's fine. But that's just the first of\nseven challenges that will test man\nversus animal. Up next, we have the highest\nvertical leaper on the planet. No joke.\nThat's his Guinness World Record. Of all 8 billion humans alive,\nnone can jump higher than him. And he's gonna be going against this\ntiger to see who can jump higher… man or animal. Team Human.\nLet’s go. Look at that majestic beast. Do you actually think you\ncan out-jump that? I’m not gonna lie. If she jumps 13,\nI’m jumping 13. Oh, look how she's looking at him.\nShe's scoping you out. Is that a threat? She doesn't speak\nEnglish. I say we get into it. They’re gonna use this meat\nto lure her and see if they can get her\nto jump. - Let me see what you got, Luna.\n- Oh,

In [18]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

'3 feet left. You\'re almost\nthere! Brian, you’re almost\nthere! Brian won! Brian won, congrats! Yes! Brian just won! Did you have fun? Oh, ok.\nI just— I won\'t talk to him.\nThat\'s fine. But that\'s just the first of\nseven challenges that will test man\nversus animal. Up next, we have the highest\nvertical leaper on the planet. No joke.\nThat\'s his Guinness World Record. Of all 8 billion humans alive,\nnone can jump higher than him. And he\'s gonna be going against this\ntiger to see who can jump higher… man or animal. Team Human.\nLet’s go. Look at that majestic beast. Do you actually think you\ncan out-jump that? I’m not gonna lie. If she jumps 13,\nI’m jumping 13. Oh, look how she\'s looking at him.\nShe\'s scoping you out. Is that a threat? She doesn\'t speak\nEnglish. I say we get into it. They’re gonna use this meat\nto lure her and see if they can get her\nto jump. - Let me see what you got, Luna.\n- Oh, she wants it. Here it is. Here comes\nthe jump. How high will the tig

In [19]:
final_prompt = prompt.invoke({"context": context_text, "question": question})

In [20]:
final_prompt

StringPromptValue(text='\n      You are a helpful assistant.\n      Answer ONLY from the provided transcript context.\n      If the context is insufficient, just say you don\'t know.\n\n      3 feet left. You\'re almost\nthere! Brian, you’re almost\nthere! Brian won! Brian won, congrats! Yes! Brian just won! Did you have fun? Oh, ok.\nI just— I won\'t talk to him.\nThat\'s fine. But that\'s just the first of\nseven challenges that will test man\nversus animal. Up next, we have the highest\nvertical leaper on the planet. No joke.\nThat\'s his Guinness World Record. Of all 8 billion humans alive,\nnone can jump higher than him. And he\'s gonna be going against this\ntiger to see who can jump higher… man or animal. Team Human.\nLet’s go. Look at that majestic beast. Do you actually think you\ncan out-jump that? I’m not gonna lie. If she jumps 13,\nI’m jumping 13. Oh, look how she\'s looking at him.\nShe\'s scoping you out. Is that a threat? She doesn\'t speak\nEnglish. I say we get into i

## Step 4 - Generation

In [21]:
answer = llm.invoke(final_prompt)
print(answer.content)

According to the video, a man can jump higher than a tiger, as Brian set a new world record by jumping 12 feet, 3 inches, which is higher than the tiger's jump.


## Building a Chain

In [22]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [23]:
def format_docs(retrieved_docs):
  context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [24]:
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

In [25]:
parallel_chain.invoke('how high does the tiger jump in the video?')

{'context': '3 feet left. You\'re almost\nthere! Brian, you’re almost\nthere! Brian won! Brian won, congrats! Yes! Brian just won! Did you have fun? Oh, ok.\nI just— I won\'t talk to him.\nThat\'s fine. But that\'s just the first of\nseven challenges that will test man\nversus animal. Up next, we have the highest\nvertical leaper on the planet. No joke.\nThat\'s his Guinness World Record. Of all 8 billion humans alive,\nnone can jump higher than him. And he\'s gonna be going against this\ntiger to see who can jump higher… man or animal. Team Human.\nLet’s go. Look at that majestic beast. Do you actually think you\ncan out-jump that? I’m not gonna lie. If she jumps 13,\nI’m jumping 13. Oh, look how she\'s looking at him.\nShe\'s scoping you out. Is that a threat? She doesn\'t speak\nEnglish. I say we get into it. They’re gonna use this meat\nto lure her and see if they can get her\nto jump. - Let me see what you got, Luna.\n- Oh, she wants it. Here it is. Here comes\nthe jump. How high 

In [26]:
parser = StrOutputParser()

In [27]:
main_chain = parallel_chain | prompt | llm | parser

In [28]:
main_chain.invoke('who jumps higher man or tiger in the video?')

"In the video, the man jumps higher than the tiger. Brian set a new world record by jumping 12 feet, 3 inches, which was higher than the tiger's jump."