In [1]:
# %pip install --upgrade --quiet youtube-transcript-api langchain_community
# !pip install pytube
# !pip install langchain_huggingface
# !pip install -qU faiss-cpu

## Step-1 :
## Load the Youtube Transcripts based on TimeStamp Chunks

In [2]:
from langchain_community.document_loaders import YoutubeLoader # Load the Youtube Transcript
from langchain_community.document_loaders.youtube import TranscriptFormat # To Get transcripts as timestamped chunks
from dotenv import load_dotenv
load_dotenv()


True

In [3]:

try:
    loader = YoutubeLoader.from_youtube_url(
        "https://www.youtube.com/watch?v=pJdMxwXBsk0&list=PLKnIA16_RmvaTbihpo4MtzVm4XOQa0ER0&index=15",
         language=["hi"],
         translation="en",
        transcript_format=TranscriptFormat.CHUNKS,
        chunk_size_seconds=60,
    )
    docs = loader.load()

    if docs:
        print(f"Successfully loaded {len(docs)} transcript chunks")
        print(f"First chunk content: {docs[0].page_content[:200]}...")  # Show first 200 chars
        print(f"First chunk metadata: {docs[0].metadata}")
    else:
        print("No transcript data was loaded (empty result)")

except Exception as e:
    print(f"Error loading YouTube transcript: {str(e)}")
    docs = None  # or [] if you prefer an empty list instead of None

# You can now check if docs exists before proceeding
if docs:
    # Continue with your processing
    pass
else:
    print("Failed to load transcript, cannot proceed")

Successfully loaded 52 transcript chunks
First chunk content: Hi guys, my name is Nitesh and welcome to my YouTube channel.  In this video also we will continue our lang chain playlist. And the topic of today's video is retrievers which is a very important topic...
First chunk metadata: {'source': 'https://www.youtube.com/watch?v=pJdMxwXBsk0&t=0s', 'start_seconds': 0, 'start_timestamp': '00:00:00'}


In [4]:
len(docs)

52

In [5]:
docs[0]

Document(metadata={'source': 'https://www.youtube.com/watch?v=pJdMxwXBsk0&t=0s', 'start_seconds': 0, 'start_timestamp': '00:00:00'}, page_content="Hi guys, my name is Nitesh and welcome to my YouTube channel.  In this video also we will continue our lang chain playlist. And the topic of today's video is retrievers which is a very important topic.  If you talk about rag.  If you want to build a RAG based application then retriever is a very important component.  In fact, in the future when you make some advanced rag systems, you will work with different types of retrievers there, so in that sense this particular video is very important. And I would like you to watch this video end to end.  So in today's video I will not only explain to you what are retrievers?  What do they need?  But at the same time I will also tell you about different types of retrievers and will show you the code.  Ok?  So ya let's start the video.  So guys, before we start the video, I would like to give you a quic

In [6]:
docs[2]

Document(metadata={'source': 'https://www.youtube.com/watch?v=pJdMxwXBsk0&t=120s', 'start_seconds': 120, 'start_timestamp': '00:02:00'}, page_content="how we are moving forward with this playlist. Now let's focus on today's video, which is on retrievers. Retrievers are very important in langche.  So we will cover this in great detail in today's video.  First of all, we will start with this discussion that what are retrievers?  So in very simple words, if you read this first line, it is written here that a retriever is a component in the language that fetches relevant documents from a data source in response to a user's query.  Ok?  If you focus on this diagram, you will understand things better visually. So what happens is that you have a data source where all your data is stored.  Ok ?  All the data related to anything is stupid.  Now this data source can be anything.  It could be a vector store and it could be some API or something.")

In [7]:
index= 0
print(docs[index].metadata)
print(docs[index].page_content)

{'source': 'https://www.youtube.com/watch?v=pJdMxwXBsk0&t=0s', 'start_seconds': 0, 'start_timestamp': '00:00:00'}
Hi guys, my name is Nitesh and welcome to my YouTube channel.  In this video also we will continue our lang chain playlist. And the topic of today's video is retrievers which is a very important topic.  If you talk about rag.  If you want to build a RAG based application then retriever is a very important component.  In fact, in the future when you make some advanced rag systems, you will work with different types of retrievers there, so in that sense this particular video is very important. And I would like you to watch this video end to end.  So in today's video I will not only explain to you what are retrievers?  What do they need?  But at the same time I will also tell you about different types of retrievers and will show you the code.  Ok?  So ya let's start the video.  So guys, before we start the video, I would like to give you a quick recap of what we have been doin

## Step-2
## Loading the embedding model and the llm

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [None]:
embedding_model = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-mpnet-base-v2")
from langchain_huggingface import ChatHuggingFace,HuggingFaceEndpoint
from transformers import AutoTokenizer
# Initialize a llm model
repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
# First load the tokenizer explicitly
tokenizer = AutoTokenizer.from_pretrained(repo_id)
llm = HuggingFaceEndpoint(
    repo_id = repo_id,
    temperature = 0.8,
    max_new_tokens=500,
)
model = ChatHuggingFace(llm=llm,tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# !pip install langchain_groq

In [11]:
from langchain_groq import ChatGroq
llm = ChatGroq(model_name = "Llama-3.3-70b-Versatile",max_tokens= 500)

## Step-3
 
 ## Creating a vectordatabase using the Chroma db


In [12]:
# vector_store = FAISS.from_documents(docs, embedding_model)
from langchain_chroma import Chroma
vectorstore = Chroma.from_documents(docs, embedding_model)

In [13]:
# !pip install langchain-chroma
# !pip install lark

## Step-4 Defining the retriever
## Using the Metadatabased Filtering for retrievers

#### -> this retriever is known as self-query retriever

In [14]:
print(docs[index].metadata)


{'source': 'https://www.youtube.com/watch?v=pJdMxwXBsk0&t=0s', 'start_seconds': 0, 'start_timestamp': '00:00:00'}


In [15]:
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The link of the video",
        type="string"
    ),
    AttributeInfo(
        name="start_seconds",
        description="The starting second of the video chunk (in seconds as integer)",
        type="integer"  # Changed from string to integer
    ),
    AttributeInfo(
        name="start_timestamp",
        description="Human-readable timestamp (HH:MM:SS format)",
        type="string"
    )
]

In [None]:
# # First get the base retriever from your vectorstore with increased k
# base_vectorstore_retriever = vectorstore.as_retriever(
#     # search_type = "mmr",
#     search_kwargs={"k": 20,'lambda_mult':0.5}  # Increase this number as needed
# )

In [23]:
document_content_description = "Transcript of a youtube video"
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    # base_retriever = base_vectorstore_retriever,
    verbose=True,
    search_kwargs={"k": 8}  # Increase this number as needed

)

In [24]:
# This example only specifies a filter
retriever.invoke("Explain me in short the summary of the video ?")
# retriever.invoke("what is meant by multi query retriever ?")

[Document(id='ca167079-0ec5-4f74-8cc9-cca59cc4ee33', metadata={'source': 'https://www.youtube.com/watch?v=pJdMxwXBsk0&t=3060s', 'start_seconds': 3060, 'start_timestamp': '00:51:00'}, page_content='application.  Ok?  So with that I will conclude this video.  If you liked the video, please like it.  If you have not subscribed to this channel, please do subscribe.  See you in the next video , bye.'),
 Document(id='60f6adc0-1eb7-47b8-82a7-688339c7a742', metadata={'source': 'https://www.youtube.com/watch?v=pJdMxwXBsk0&t=120s', 'start_seconds': 120, 'start_timestamp': '00:02:00'}, page_content="how we are moving forward with this playlist. Now let's focus on today's video, which is on retrievers. Retrievers are very important in langche.  So we will cover this in great detail in today's video.  First of all, we will start with this discussion that what are retrievers?  So in very simple words, if you read this first line, it is written here that a retriever is a component in the language tha

## Step 5 Creating a rag_chain