In [61]:
from langchain_community.document_loaders import YoutubeLoader

In [3]:
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=mrKuDK9dGlg&list=PLrLEqwuz-mRIEtuUEN8sse2XyksKNN4Om", add_video_info=False,
    language=["en", "id"],
    translation="en",
)
d = loader.load()
d[0].page_content



In [4]:
d



In [62]:
from youtube_transcript_api import YouTubeTranscriptApi

In [63]:
from pydantic import BaseModel
from dotenv import load_dotenv
import google.generativeai as genai
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma  # chroma for storing vector store locally
from langchain_community.embeddings import HuggingFaceBgeEmbeddings # for converting text to embedings
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
load_dotenv()
api =os.getenv("GOOGLE_API_KEY") #uncomment this during locally
genai.configure(api_key=api) 

In [64]:
load_dotenv()

True

In [50]:
pinecone_api= os.getenv("PINECONE_API_KEY")

In [65]:
def get_transcript(url:str):
     """
     Try to get the transcription of the yt video
     input: url
     output: yt_transcription
     """
     try:
        loader = YoutubeLoader.from_youtube_url(
            url, add_video_info=False,
        language=["en", "hi"],
        translation="en",
        )
        doc = loader.load()
        return doc
     except:
        return "not able to get transcript"

In [66]:
def yt_transcript(video_id):
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        for transcript in transcript_list:
            transcript_text =transcript.translate('en').fetch()
        return transcript_text

    except Exception as e:
        raise e


In [69]:
trans_from_get = get_transcript("https://www.youtube.com/watch?v=btuN-rrPhsM&t=2465s")
trans_from_get

[Document(page_content="hello everyone welcome to AI anytime  channel in this video we are going to  explore a new large language model  called Jer 7B beta so Jer 7B beta it's a  new release uh by hugging face  H4 uh they also created uh a model  earlier that's that was the Alpha  version of this particular model that  that was named Jer 7B Alpha I do have a  video on that as well please check that  out uh the video title is Jer 7B with  with chain lit okay so in that time we  we just uh inference the model uh in a  chain lit application just to evaluate  on some queries that have it's  performing now in this video we will use  j4 7B beta but not for the uh the stand  alone inference but we going to build a  rag uh system so we going to implement  retrieval augmented gen generation using  jeer 7B  beta uh let's talk about jeer 7B beta  you can see here on I am currently on  their uh hugging F repository and I like  hugging fish H4 who are the creators of  this particular model because 

In [70]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(documents=trans_from_get)

In [71]:
texts

[Document(page_content="hello everyone welcome to AI anytime  channel in this video we are going to  explore a new large language model  called Jer 7B beta so Jer 7B beta it's a  new release uh by hugging face  H4 uh they also created uh a model  earlier that's that was the Alpha  version of this particular model that  that was named Jer 7B Alpha I do have a  video on that as well please check that  out uh the video title is Jer 7B with  with chain lit okay so in that time we  we just uh inference the model uh in a", metadata={'source': 'btuN-rrPhsM'}),
 Document(page_content="time we  we just uh inference the model uh in a  chain lit application just to evaluate  on some queries that have it's  performing now in this video we will use  j4 7B beta but not for the uh the stand  alone inference but we going to build a  rag uh system so we going to implement  retrieval augmented gen generation using  jeer 7B  beta uh let's talk about jeer 7B beta  you can see here on I am currently on  th

In [73]:
from langchain.vectorstores import FAISS

In [36]:
trans =yt_transcript("btuN-rrPhsM'")

In [39]:
trans[0]["duration"]

6.0

In [14]:
class CheckURL(BaseModel):
    url: str
class CheckSearchInput(BaseModel):
    textInput: str

In [74]:
model_name = "BAAI/bge-large-en"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


In [75]:
v_store = FAISS.from_documents(documents=texts,embedding=embeddings)

In [78]:
prompt_template = """Use the following  text transcript to answer the user's question in detail.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    Context: {context}
    Question: {question}

    Only return the  answer below and nothing else.
    Detailed answer:
    """

In [79]:
prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])

In [80]:
model = ChatGoogleGenerativeAI(model="gemini-pro",
                            temperature=0.2,convert_system_message_to_human=True,google_api_key=api)
    
chain_type_kwargs = {"prompt": prompt}
qa = RetrievalQA.from_chain_type(
llm=model,
chain_type="stuff",
retriever=v_store.as_retriever(),
return_source_documents = True,
chain_type_kwargs= chain_type_kwargs,
verbose=True
)


In [83]:
qa("what is zehyer 7b model")["result"]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'Zehyer 7b is a new large language model released by hugging face. It is a 7B parameter model that has been fine-tuned on a variety of tasks, including text generation, translation, and question answering.'

In [4]:
def validate_url(url):
    """
    validate if transcription is available.If available convert the transcript to embendings
    and store it in croma vector store
    input: url
    output: response
    """
    doc = get_transcript(url)
    if doc!="not able to get transcript":
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        texts = text_splitter.split_text(text=doc[0].page_content)
        vector_index = Chroma.from_texts(texts, embeddings, collection_metadata={"hnsw:space": "cosine"}).as_retriever(search_kwargs={"k":1})
    else:
        return None
    return vector_index


In [18]:
vector = validate_url("https://www.youtube.com/watch?v=btuN-rrPhsM&t=2465s")

In [19]:
def yt_search(textInput):
    """
    get the output using vecotr and llm
    input: text
    output: response
    """
    prompt_template = """Use the following pieces of information to answer the user's question.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    Context: {context}
    Question: {question}

    Only return the helpful answer below and nothing else.
    Helpful answer:
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
    load_vector_store = Chroma(persist_directory="stores/yt_cosine", embedding_function=embeddings)
    retriever = load_vector_store.as_retriever(search_kwargs={"k":1})
    
    model = ChatGoogleGenerativeAI(model="gemini-pro",
                            temperature=0.2,convert_system_message_to_human=True,google_api_key=api)
    
    chain_type_kwargs = {"prompt": prompt}
    qa = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff",
        retriever=retriever,
        return_source_documents = True,
        chain_type_kwargs= chain_type_kwargs,
        verbose=True
    )

    response = qa(textInput)

    print(response)
    return response

In [20]:
res = yt_search("what is pet document")


  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
{'query': 'what is pet document', 'result': 'I do not have the answer to your question.', 'source_documents': []}
