In [None]:
from google.colab import userdata
import os

### Library Installation

In [None]:
!pip install -q youtube-transcript-api langchain-community langchain-openai \ faiss-cpu tiktoken python-dotenv

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

### Indexing Step-1: Document Ingestion

In [None]:
video_id = "Gfr50f6ZBvo" ## only ID, not entire URL

In [None]:
from youtube_transcript_api import RequestBlocked, TranscriptsDisabled, YouTubeTranscriptApi

try:
  # fetch() looks for English captions unless a `languages` preference list is passed.
  # to_raw_data() converts the result to a list of {'text', 'start', 'duration'} dicts.
  transcript_list = YouTubeTranscriptApi().fetch(video_id).to_raw_data()

  # Flatten it to plain text
  transcript = " ".join(chunk['text'] for chunk in transcript_list)
  print(transcript)
except TranscriptsDisabled:
  print("No transcript/captions available for this video")
except RequestBlocked:
  # Raised when YouTube refuses the request from this IP address.
  print("YouTube blocked the transcript request from this IP address")

RequestBlocked: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=Gfr50f6ZBvo! This is most likely caused by:

YouTube is blocking requests from your IP. This usually is due to one of the following reasons:
- You have done too many requests and your IP has been blocked by YouTube
- You are doing requests from an IP belonging to a cloud provider (like AWS, Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud providers are blocked by YouTube.

There are two things you can do to work around this:
1. Use proxies to hide your IP address, as explained in the "Working around IP bans" section of the README (https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception).
2. (NOT RECOMMENDED) If you authenticate your requests using cookies, you will be able to continue doing requests for a while. However, YouTube will eventually permanently ban the account that you have used to authenticate with! So only do this if you don't mind your account being banned!

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!

In [None]:
# Number of caption entries returned for the video.
len(transcript_list)

In [None]:
transcript_list[0]
## duration is the time for which the caption remains on the screen
## all times in seconds

In [None]:
# Load the transcript text from the local `transcript.txt` file (UTF-8).
with open("transcript.txt", mode="r", encoding="utf-8") as transcript_file:
    transcript = transcript_file.read()

### Indexing Step-2: Text-Splitting

In [None]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)

In [None]:
chunks = splitter.create_documents([transcript])

In [None]:
chunks[0]

Document(metadata={}, page_content="the following is a conversation with demus hasabis ceo and co-founder of deepmind a company that has published and builds some of the most incredible artificial intelligence systems in the history of computing including alfred zero that learned all by itself to play the game of gold better than any human in the world and alpha fold two that solved protein folding both tasks considered nearly impossible for a very long time demus is widely considered to be one of the most brilliant and impactful humans in the history of artificial intelligence and science and engineering in general this was truly an honor and a pleasure for me to finally sit down with him for this conversation and i'm sure we will talk many times again in the future this is the lex friedman podcast to support it please check out our sponsors in the description and now dear friends here's demis hassabis let's start with a bit of a personal question am i an ai program you wrote to inter

In [None]:
len(chunks)

168

### Indexing Step-3 & 4: Embedding Generation and Storing in Vector Store

In [None]:
# Embed with OpenAI's `text-embedding-3-small` model.
# NOTE(review): presumably needs OPENAI_API_KEY in the environment; no cell here sets it - confirm.
embeddings = OpenAIEmbeddings(model = 'text-embedding-3-small')
# Only the first 10 chunks are indexed here (chunks[:10]), not the whole transcript.
# `vector_store` is rebound further down to an index built from every chunk with Ollama embeddings.
vector_store = FAISS.from_documents(chunks[:10], embeddings)

In [None]:
!pip install langchain_core

In [None]:
import os, time
from tenacity import retry, wait_random_exponential, stop_after_attempt
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents.base import Document  # for types


# Retry with randomized exponential backoff: waits between 1 and 60 seconds, at most 6 attempts.
# No `retry=` predicate is passed, so retrying is not restricted to rate-limit errors.
@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=60))
def embed_batch(texts: list[str]) -> list[list[float]]:
    """Embed `texts` with OpenAI's `text-embedding-3-small` model.

    A new `OpenAIEmbeddings` client is created on every call, and the result of
    its `embed_documents(texts)` is returned unchanged.
    """
    client = OpenAIEmbeddings(model="text-embedding-3-small")
    vectors = client.embed_documents(texts)
    return vectors

def build_faiss_from_docs(documents: list[Document], batch_size: int = 32, rpm: int = 30):
    """Split `documents`, embed the chunks in throttled batches, and index them in FAISS.

    Args:
        documents: Documents to split (chunk_size=1000, chunk_overlap=200) and index.
        batch_size: Number of chunks sent to `embed_batch` per request; must be positive.
        rpm: Maximum embedding requests per minute; must be positive. A pause of
            ``60 / rpm`` seconds is inserted between consecutive batches.

    Returns:
        The populated FAISS vector store, or ``None`` when splitting yields no chunks.

    Raises:
        ValueError: If `batch_size` or `rpm` is not positive.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (rpm == 0)
    # or a silently empty loop (negative batch_size).
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if rpm <= 0:
        raise ValueError("rpm must be a positive number")

    # Split docs into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(documents)

    vector_store = None
    interval = 60.0 / rpm

    for i in range(0, len(chunks), batch_size):
        # Throttle only between requests: no wait before the first batch and
        # none after the last one.
        if i > 0:
            time.sleep(interval)

        batch = chunks[i : i + batch_size]
        texts = [d.page_content for d in batch]
        embs = embed_batch(texts)

        if vector_store is None:
            # The first batch creates the store; the embedding object is handed
            # to FAISS so the store can embed queries later.
            vector_store = FAISS.from_embeddings(
                text_embeddings=zip(texts, embs),
                embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
                metadatas=[d.metadata for d in batch],
            )
        else:
            # Later batches are appended to the existing store.
            vector_store.add_embeddings(
                text_embeddings=zip(texts, embs),
                metadatas=[d.metadata for d in batch],
            )

    return vector_store

# Usage example:
# docs = [Document(page_content="Hello", metadata={}), ...]
# vs = build_faiss_from_docs(docs)
# vs.save_local("faiss_index")


In [None]:
!pip install ollama

Collecting ollama
  Downloading ollama-0.5.1-py3-none-any.whl.metadata (4.3 kB)
Downloading ollama-0.5.1-py3-none-any.whl (13 kB)
Installing collected packages: ollama
Successfully installed ollama-0.5.1


In [None]:
import ollama

# Sanity check: request the embedding of a throwaway prompt from the `nomic-embed-text` model.
# NOTE(review): presumably needs a reachable Ollama server with that model pulled; no cell here starts one - confirm.
response = ollama.embeddings(
    model='nomic-embed-text',
    prompt='Nom Nom Nom'
)

# The vector is stored under the 'embedding' key of the response.
embedding = response['embedding']


In [None]:
len(embedding)

768

In [None]:
from langchain_community.embeddings import OllamaEmbeddings

In [None]:
embedding_model = OllamaEmbeddings(model="nomic-embed-text")

  embedding_model = OllamaEmbeddings(model="nomic-embed-text")


In [None]:
vector_store = FAISS.from_documents(chunks, embedding_model)


In [None]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x79e1c80526d0>

In [None]:
vector_store.save_local("./", index_name="vs1")

In [None]:
# Reload the "vs1" index from the current directory, using the same embedding model for queries.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading the pickled sidecar (vs1.pkl);
# unpickling can execute arbitrary code, so only load index files this notebook wrote itself.
vector_store = FAISS.load_local("./", embeddings=embedding_model, index_name="vs1", allow_dangerous_deserialization=True)

In [None]:
!ls

sample_data  transcript.txt  vs1.faiss	vs1.pkl


In [None]:
vector_store.index_to_docstore_id

{0: '51fa834c-a177-4f7c-ab77-304081a6357f',
 1: 'f70d3ef3-1433-48a9-9f54-7ca222cdd250',
 2: '0877cdd8-2441-458d-b9b5-fa1c1711fc42',
 3: '122a4e51-a73e-421b-a1b7-ba056a836d3d',
 4: 'aceb4ed1-1b91-4e92-b4d8-245806301180',
 5: '6e50a7d0-92f9-4b14-9b28-a13d8464a5dc',
 6: 'b5876821-3791-4a6b-b606-56687f9cece7',
 7: 'bf9f3fed-aa42-49e6-ac27-96fe0de4fc83',
 8: '69764df4-94a6-40b5-8830-3d2624c75778',
 9: 'c39634cd-f875-4ff7-8da9-5cc2a8d04c54',
 10: '0a59ebca-5cea-4828-a5f6-d6b9c592abe1',
 11: '147a4719-573d-4254-911f-e2e120208048',
 12: '7fd1c397-2168-45bc-923c-d0489df05fcb',
 13: '5a2ba799-9116-4e30-b959-03e022a927ac',
 14: 'abfe6319-553d-4e29-bafc-fdb36459c820',
 15: 'fb322225-8eb7-46af-9541-5ae555ea78b1',
 16: 'eaa6ef23-d4b1-433b-b8c4-83315e7727a8',
 17: 'af8c02cb-ad40-4c09-9b6c-87e2d0e16e49',
 18: '776bae15-b5cf-42d8-9ba1-07eaf26b604d',
 19: '0f310932-2104-40f7-90bb-0f8cd64b18c1',
 20: 'fe379ad2-f7b2-409a-8d7a-bb80163ad926',
 21: '24cb4582-52d2-4397-af8d-9fd99300e569',
 22: '3dffb772-f375-

In [None]:
# Document IDs are UUIDs assigned when the index is built, so a hard-coded ID
# stops matching after a rebuild. Look up the ID of the last index position instead.
last_position = max(vector_store.index_to_docstore_id)
vector_store.get_by_ids([vector_store.index_to_docstore_id[last_position]])

[Document(id='c63e084d-5331-410e-b6ad-c85ffa949e56', metadata={}, page_content='demas establish to support this podcast please check out our sponsors in the description and now let me leave you with some words from edskar dykstra computer science is no more about computers than astronomy is about telescopes thank you for listening and hope to see you next time')]

### Retrieval

In [None]:
retriever = vector_store.as_retriever(search_type = 'similarity', search_kwargs = {'k' : 4})

In [None]:
retriever.invoke('What is Deepmind?')

[Document(id='3011c994-99cd-407d-a78d-04b63c5b8dcf', metadata={}, page_content="and how it works this is tough to uh ask you this question because you probably will say it's everything but let's let's try let's try to think to this because you're in a very interesting position where deepmind is the place of some of the most uh brilliant ideas in the history of ai but it's also a place of brilliant engineering so how much of solving intelligence this big goal for deepmind how much of it is science how much is engineering so how much is the algorithms how much is the data how much is the hardware compute infrastructure how much is it the software computer infrastructure yeah um what else is there how much is the human infrastructure and like just the humans interact in certain kinds of ways in all the space of all those ideas how much does maybe like philosophy how much what's the key if um uh if if you were to sort of look back like if we go forward 200 years look back what was the key 

In [None]:
retriever ## INPUT: Query, RESULT: List of Documents

VectorStoreRetriever(tags=['FAISS', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7ec648c0f5d0>, search_kwargs={'k': 4})

### Augmentation

In [None]:
!pip install langchain_google_genai



In [None]:
os.environ["GOOGLE_API_KEY"] = userdata.get('GEMINI_KEY')

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI


# Gemini chat model; `llm` is reused by the generation and chain cells further down.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.9)

# Quick smoke test of the model. Cell-specific names are used so this cell does not
# bind `prompt` (a PromptTemplate elsewhere in the notebook) or `response`
# (the Ollama embedding result) to unrelated objects.
smoke_test_prompt = "Explain what ChatGoogleGenerativeAI function do in langchain_google_genai."
smoke_test_response = llm.invoke(smoke_test_prompt)
print(smoke_test_response.content)


The `ChatGoogleGenerativeAI` class in `langchain_google_genai` is a Langchain integration for interacting with Google's Generative AI models, specifically those designed for conversational (chat) interactions.  It's a wrapper around the Google Generative AI SDK that makes it easy to build conversational AI applications within the Langchain ecosystem.

Here's a breakdown of what it does and its key features:

**Core Functionality:**

* **Connects to Google's Chat Models:** The primary function is to establish a connection to Google's Generative AI API, allowing you to send chat-based prompts and receive responses from models like Gemini Pro. It handles the authentication and communication with the Google Cloud backend.

* **Chat-Focused Interface:** Unlike models designed for single-shot text generation, `ChatGoogleGenerativeAI` is optimized for multi-turn conversations. It maintains a conversation state (through the underlying Gemini API) and can understand context from previous turns.

In [None]:
# RAG prompt: tells the model to answer only from the supplied transcript text ({context})
# and to say it doesn't know when that text is insufficient for the {question}.
# The template's leading newline and indentation are part of the string sent to the model.
prompt = PromptTemplate(
    template = """
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [None]:
question = "is the topic of aliens discussed in the video? If yes, then what was discussed?"

In [None]:
retrieved_docs = retriever.invoke(question)

In [None]:
retrieved_docs

[Document(id='1ed3c3b8-9c29-40c1-abb2-a20716d62147', metadata={}, page_content="thoughts it could be some interactions with our mind that we think are originating from us is actually something that uh is coming from other life forms elsewhere consciousness itself might be that it could be but i don't see any sensible argument to the why why would all of the alien species be using this way yes some of them will be more primitive they would be close to our level you know there would there should be a whole sort of normal distribution of these things right some would be aggressive some would be you know curious others would be very stoical and philosophical because you know maybe they're a million years older than us but it's not it shouldn't be like what i mean one one alien civilization might be like that communicating thoughts and others but i don't see why you know potentially the hundreds there should be would be uniform in this way right it could be a violent dictatorship that the t

In [None]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

In [None]:
final_prompt = prompt.invoke({'context': context_text, 'question': question})

### Generation

In [None]:
answer = llm.invoke(final_prompt)

In [None]:
print(answer.content)

Yes, the topic of aliens is discussed in the video.
The discussion includes:

*   The possibility of alien civilizations communicating through our thoughts or consciousness.
*   The lack of evidence for alien civilizations, such as Dyson spheres or other signs of advanced technology.
*   The idea that there may be a universal rule not to interfere with primitive species like humans.
*   The possibility that we are alone in the universe.
*   The possibility that the way aliens communicate is fundamentally different and we are too dumb to understand.
*   The possibility of safari view, where we are a primitive species and there's some kind of universal rule not to interfere.


### Building a Chain

In [None]:
## call invoke once, and the entire pipeline runs
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda

In [None]:
def format_docs(retrieved_docs):
  """Merge the `page_content` of each retrieved document into one string.

  Documents are separated by a blank line; an empty input gives an empty string.
  """
  pieces = [doc.page_content for doc in retrieved_docs]
  return "\n\n".join(pieces)

In [None]:
# Fan the incoming question out to the two prompt variables; the result is a dict:
#   context  -> retrieve matching chunks, then merge them into one string
#   question -> the input forwarded unchanged
parallel_chain = RunnableParallel(
    context=retriever | RunnableLambda(format_docs),
    question=RunnablePassthrough(),
)

In [None]:
parallel_chain.invoke('who is Demis?')

{'context': "demas establish to support this podcast please check out our sponsors in the description and now let me leave you with some words from edskar dykstra computer science is no more about computers than astronomy is about telescopes thank you for listening and hope to see you next time\n\nthe following is a conversation with demus hasabis ceo and co-founder of deepmind a company that has published and builds some of the most incredible artificial intelligence systems in the history of computing including alfred zero that learned all by itself to play the game of gold better than any human in the world and alpha fold two that solved protein folding both tasks considered nearly impossible for a very long time demus is widely considered to be one of the most brilliant and impactful humans in the history of artificial intelligence and science and engineering in general this was truly an honor and a pleasure for me to finally sit down with him for this conversation and i'm sure we 

In [None]:
from langchain_core.output_parsers import StrOutputParser
parser = StrOutputParser()

In [None]:
main_chain = parallel_chain | prompt | llm | parser

In [None]:
main_chain.invoke('Can you summarise the video?')

'The video discusses the beginning of an explanation that would encompass many mysteries, such as consciousness, life, and gravity. It also touches on testing AI capabilities on a range of tasks to see if it reaches human level or above performance, and the generalizability across multiple tasks.'