In [None]:
import os
from getpass import getpass

# Never commit a key inside the notebook. Reuse a key that is already set in
# the environment; otherwise prompt for one without echoing it to the output.
# (Assigning an empty string unconditionally would clobber an exported key.)
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

In [None]:
# Step 1 a - Doc Loader: fetch the video's English captions and flatten the
# caption entries into one transcript string.
# NOTE(review): newer youtube-transcript-api releases appear to replace the
# static `get_transcript` with an instance-level `fetch` - confirm against the
# installed version.
video_id = "Gfr50f6ZBvo"

try:
    transcripts_list = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
except TranscriptsDisabled:
    # Every later cell needs `transcript`. Re-raise so the notebook stops here
    # instead of failing with a NameError further down (or silently reusing a
    # stale transcript left over in the kernel from a previous run).
    print("No captions available for this video.")
    raise

# Each caption entry is indexed by 'text'; join them with single spaces.
transcript = " ".join(chunk['text'] for chunk in transcripts_list)

# Show the size and a short preview rather than dumping the whole transcript.
print(f"{len(transcript):,} characters")
print(transcript[:500])

In [None]:
# Step 1 b - Splitter: break the transcript into overlapping chunks
# (1000-character chunks with a 200-character overlap).
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript])

# A bare `len(chunks)` in the middle of a cell is never displayed; print it.
print(f"{len(chunks)} chunks")

# Peek at one chunk. Clamp the index so transcripts that produce fewer than
# 101 chunks do not raise IndexError; show None if there are no chunks at all.
sample_index = min(100, len(chunks) - 1)
chunks[sample_index] if chunks else None

In [None]:
# Step 1 c, 1 d - Embed the chunks and store the vectors in a FAISS index
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = FAISS.from_documents(documents=chunks, embedding=embeddings)

# Look up a stored document by id. Take the id from the store's own
# index -> docstore-id mapping: an id pasted from an earlier run will generally
# not exist in a freshly built store.
sample_ids = list(vector_store.index_to_docstore_id.values())[:1]
vector_store.get_by_ids(sample_ids)

In [None]:
# Step 2 - Retrieval
# Expose the vector store as a retriever configured for similarity search
# with k=4, then try it on a sample query.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 4},
    search_type="similarity",
)
retriever
retriever.invoke("What is happening here?")

In [None]:
# Step 3 - Augmentation
# Retrieve the chunks relevant to the question and splice them into the
# prompt template as context.
question = "is the topic of nuclear fusion discussed in this video? if yes then what was discussed"

prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
)
llm = ChatOpenAI(temperature=0.2, model="gpt-4o-mini")

retrieved_docs = retriever.invoke(question)
retrieved_docs

# Blank-line-separated text of every retrieved chunk.
context_text = "\n\n".join([doc.page_content for doc in retrieved_docs])
context_text
final_prompt = prompt.invoke({"question": question, "context": context_text})
final_prompt

In [None]:
# Step 4 - Generation

# Send the assembled prompt to the chat model and print the reply's content.
answer = llm.invoke(final_prompt)
reply_text = answer.content
print(reply_text)

In [None]:
# The same pipeline (retrieve -> augment -> generate), rebuilt as a single chain.

from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

def format_docs(retrieved_docs):
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The page contents joined by a blank line (``"\\n\\n"``); an empty
        string when no documents are given.
    """
    page_texts = [doc.page_content for doc in retrieved_docs]
    return "\n\n".join(page_texts)

# Assemble the chain. The incoming question is fanned out two ways: through
# the retriever (whose documents are formatted into the context string) and
# passed through unchanged. Both feed the prompt, then the model, then the
# string output parser.
parser = StrOutputParser()
parallel_chain = RunnableParallel(
    {
        'context': retriever | RunnableLambda(format_docs),
        'question': RunnablePassthrough(),
    }
)
main_chain = parallel_chain | prompt | llm | parser
result = main_chain.invoke('Can you summarize the video')