In [None]:
!pip install youtube-transcript-api # YouTube lib to get transcript
!pip install pytube                 # YouTube lib
!pip install openai                 # OpenAI GPT
!pip install transformers           # Used to know the token size
!pip install chromadb
!pip install python-dotenv

In [None]:
# Fetch the LangChain sources from GitHub into the current working directory.
# NOTE(review): nothing visible in this notebook installs the clone or adds it to
# sys.path — confirm that an installed `langchain` package is what the imports
# below actually resolve to.
!git clone https://github.com/hwchase17/langchain.git

In [None]:
from dotenv import dotenv_values

# Read the key/value pairs stored in keys.txt; the file is expected to define
# OPENAI_API_KEY, which is looked up from this mapping further down.
api_keys = dotenv_values(dotenv_path='keys.txt')

In [None]:
from langchain.llms import OpenAI

# LLM handle shared by the "stuff" chains below; the key is taken from the
# `api_keys` mapping rather than from the process environment.
llm = OpenAI(
    openai_api_key=api_keys['OPENAI_API_KEY'],
)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured for chunks of size 2000 with no overlap between
# consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=2000,
)

In [None]:
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders import YoutubeLoader

In [None]:
# Build a loader for one YouTube video (extra video info is not requested).
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=FtyKgybYkaE",
    add_video_info=False,
)
# The loaded documents are what gets split into chunks afterwards.
result = loader.load()

In [None]:
# Split the documents loaded from the video into chunks.
# `text_splitter` is the RecursiveCharacterTextSplitter already created in an
# earlier cell with exactly these settings (chunk_size=2000, chunk_overlap=0),
# so it is reused here instead of being imported and constructed a second time.
docs = text_splitter.split_documents(result)
# Displayed as the cell output: how many chunks were produced.
len(docs)

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

In [None]:
# Pass the API key explicitly, the same way the `llm` cell does. The key is only
# loaded into the `api_keys` dict and nothing visible in this notebook exports it
# to the environment, so relying on an implicit OPENAI_API_KEY lookup would fail
# on a fresh kernel where that variable is not set.
embeddings = OpenAIEmbeddings(openai_api_key=api_keys['OPENAI_API_KEY'])

In [None]:
# Build a Chroma vector store from the chunks in `docs` using `embeddings`,
# with 'db' given as the persistence directory.
db = Chroma.from_documents(
    docs,
    embeddings,
    persist_directory='db',
)

In [None]:
# Question used both for the similarity search and as the chains' "question" input.
query = (
    "What's the movie at this years Devoxx Belgium?"
)

In [None]:
# Look up the stored chunks that best match the question.
# Note: this rebinds `docs`, which until now held every chunk produced by the
# splitter; from here on it holds only the search hits.
docs = db.similarity_search(query=query)
# Displayed as the cell output: number of hits returned.
len(docs)

In [None]:
from langchain.chains.question_answering import load_qa_chain

# Question answering over the retrieved documents with the "stuff" chain type;
# the value returned by run() is displayed as the cell output.
chain = load_qa_chain(llm, chain_type="stuff")
chain.run(
    question=query,
    input_documents=docs,
)

In [None]:
#
# Question answering with sources ("stuff" chain type)
#
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

chain = load_qa_with_sources_chain(llm, chain_type="stuff")

# Invoke the chain with its input mapping and ask for the outputs only.
chain(
    {"input_documents": docs, "question": query},
    return_only_outputs=True,
)

In [89]:
#
# Load QnA with intermediate steps & map rerank
#
# Map-Rerank:
# An initial prompt is run on each chunk of data; it not only tries to complete
# the task but also gives a score for how certain it is in its answer.
# The responses are then ranked by this score, and the HIGHEST-SCORING one is returned.
#
chain = load_qa_with_sources_chain(
    # The API key is passed explicitly, consistent with the `llm` cell: it is only
    # loaded into `api_keys`, and nothing visible here exports it to the environment.
    OpenAI(temperature=0, openai_api_key=api_keys['OPENAI_API_KEY']),
    chain_type="map_rerank",
    metadata_keys=['source'],
    return_intermediate_steps=True
)

# Note: `result` previously held the documents returned by the YouTube loader;
# from here on it holds this chain's output mapping.
result = chain({"input_documents": docs, "question": query}, return_only_outputs=True)

# Per-document answers with their scores (enabled by return_intermediate_steps=True).
result["intermediate_steps"]

[{'answer': ' This document does not answer the question.', 'score': '0'},
 {'answer': ' This document does not answer the question.', 'score': '0'},
 {'answer': ' Top Gun: Maverick', 'score': '100'}]

In [90]:
# Number of documents that were handed to the chain as "input_documents".
len(docs)

3

In [91]:
# Display the complete output mapping returned by the map_rerank chain call.
result

{'source': 'FtyKgybYkaE',
 'intermediate_steps': [{'answer': ' This document does not answer the question.',
   'score': '0'},
  {'answer': ' This document does not answer the question.', 'score': '0'},
  {'answer': ' Top Gun: Maverick', 'score': '100'}],
 'output_text': ' Top Gun: Maverick'}