### Pipeline for QnA with Memory

In [None]:
####Notes####
# The chunked data passed to Azure OpenAI in one call can hold at most 16 chunks

In [None]:
# !pip3 install langchain
# !pip install openai
# !pip3 install openai chromadb
# !pip3 install tiktoken

In [None]:
# !pip3 install unstructured
# !pip3 install pdf2image
# !pip3 install pdfminer
# !pip3 install pdfminer.six
# !pip3 install pymupdf 

In [None]:
import os
import sys
import glob
import re
import importlib
import langchain
from langchain.document_loaders import WebBaseLoader, UnstructuredPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import AzureOpenAI
from langchain.prompts import PromptTemplate
# Add ../scripts (relative to the current working directory) to the import path
# so the project-local modules below can be imported.
sys.path.append(os.path.join(os.getcwd(), '../scripts'))
import helpers as h
import constants as c
# Re-execute both modules so changes made to them since the first import take effect.
importlib.reload(h)
importlib.reload(c)

#### Environment

In [None]:
# SECURITY: the API key used to be hard-coded in this cell. Secrets must not
# live in the notebook; the key that was committed here should be rotated.
# Use the key already present in the environment, otherwise ask for it
# interactively (getpass does not echo the typed value).
if not os.environ.get("OPENAI_API_KEY"):
    import getpass

    os.environ["OPENAI_API_KEY"] = getpass.getpass("Azure OpenAI API key: ")

In [None]:
# Azure-specific OpenAI client settings, exported as environment variables.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview"
os.environ["OPENAI_API_BASE"] = "https://test-chatgpt-flomoney.openai.azure.com/"
# The key is expected to be in the environment already. Assigning it to itself
# did nothing when it was set and raised an opaque TypeError when it was not,
# so the assignment is replaced by an explicit notice about the missing key.
if not os.getenv("OPENAI_API_KEY"):
    import warnings

    warnings.warn(
        "OPENAI_API_KEY is not set; calls that need the key will fail."
    )

#### Paths

In [None]:
# Input folders, resolved against the current working directory.
pdfs_path, merged_pdfs_path = (
    os.path.join(os.getcwd(), relative)
    for relative in ('../data/pdfs', '../data/pdfs_processed/merged')
)

#### PDF

In [None]:
# Keep only the PDFs whose full path contains 'investorcom'.
investorcom_pdfs = [
    pdf_file
    for pdf_file in glob.glob(os.path.join(pdfs_path, '*.pdf'))
    if 'investorcom' in pdf_file
]

##### - Load

In [None]:
# Load every selected PDF and gather the resulting documents in one flat list.
loaded_data = []
for pdf in investorcom_pdfs:
    loaded_data += UnstructuredPDFLoader(file_path=pdf).load()

In [None]:
# Number of documents returned by the PDF loaders
len(loaded_data)

In [None]:
# Character count of the first loaded document's text
len(loaded_data[0].page_content)

In [None]:
# for pdf in pdf_list :
#     try :
#         pdf_data.extend(UnstructuredPDFLoader(file_path=pdf).load())        
#     except NameError:
#         pdf_loader = UnstructuredPDFLoader(file_path=pdf)
#         pdf_data = pdf_loader.load()

##### - Split

In [None]:
# Chunk size is c.prompt_max divided (integer division) by the number of
# chunks retrieved per question, c.retrieval_kwargs['k']; chunks do not overlap.
chunk_overlap = 0
chunk_size = c.prompt_max // c.retrieval_kwargs['k']
print(chunk_size)

In [None]:
# Text splitter configured with the chunk size and overlap chosen for this run.
data_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

In [None]:
# Break the loaded documents into chunks with the configured splitter
chunked_data = data_splitter.split_documents(loaded_data)

In [None]:
# Number of chunks produced
len(chunked_data)

In [None]:
# Character count of the fifth chunk's text
len(chunked_data[4].page_content)

##### - Store

In [None]:
# Number of chunks handed to the vector store per batch
chunks_max = 15

In [None]:
# Name passed as the embeddings `deployment` below
vectorstore_engine = 'Finbot-embedding-2'

In [None]:
# Embedding client used when documents are added to the vector store
embedding_model = OpenAIEmbeddings(deployment=vectorstore_engine)

In [None]:
# Best-effort reset so that re-running this cell does not stack documents on
# top of a previous run. NameError covers the first run, when `vectorstore`
# does not exist yet.
try:
    vectorstore.delete_collection()
except (ValueError, NameError):
    pass

# Embed in batches of `chunks_max` chunks. The store is created from the first
# batch only and later batches are appended to that same store; calling
# Chroma.from_documents for every batch rebinds `vectorstore` to a new object
# each time, which does not guarantee that earlier batches stay in the store
# that is finally kept.
batches = [
    chunked_data[start:start + chunks_max]
    for start in range(0, len(chunked_data), chunks_max)
]
if batches:
    vectorstore = Chroma.from_documents(
        documents=batches[0], embedding=embedding_model
    )
    for batch in batches[1:]:
        vectorstore.add_documents(batch)

In [None]:
# Number of entries under 'documents' in the payload returned by `get()`
len(vectorstore.get()['documents'])

In [None]:
# Field names available in the payload returned by `get()`
vectorstore.get().keys()

##### - Retrieve

In [None]:
# Sample question used by the retrieval experiments that follow
question = "what are the pros and cons of tesla stock and nvidia stock?"

In [None]:
# similarity_search_with_relevance_scores : normalizes scores between 0 and 1.
# higher score means more similar
# `search_type` is an argument of `as_retriever`, not of this method; passing
# it here only forwarded a stray keyword to the underlying search, so it is
# dropped. The 0.7 cut-off is still supplied through `score_threshold`.
docs = vectorstore.similarity_search_with_relevance_scores(
    question, score_threshold=0.7
)
len(docs)

In [None]:
# Second element of the first result — presumably the score of a (document, score) pair; verify.
# NOTE(review): raises IndexError if the search above returned no results.
docs[0][1]

In [None]:
# similarity_search_with_score
# lower score means more similar (less distance)
docs = vectorstore.similarity_search_with_score(question, k = 4)
len(docs)

In [None]:
# Second element of the first result from the search above
docs[0][1]

In [None]:
# mmr

In [None]:
# Max-marginal-relevance search, parameterised by constants.retrieval_kwargs
docs_mmr = vectorstore.max_marginal_relevance_search(
    question, **c.retrieval_kwargs
)

In [None]:
# Metadata attached to the first MMR result
docs_mmr[0].metadata

In [None]:
# print(docs[1][0].page_content)

##### - Generate

In [None]:
# Settings for the answer-generation experiments below.
# -- model
llm_engine = 'finbot-gpt'
llm_model = 'text-davinci-002'
temperature = 0
answer_max_tokens = 512
# -- retrieval
# NOTE(review): these values are defined locally, while the split step reads
# c.retrieval_kwargs['k'] — confirm the two are meant to agree.
search_type = 'mmr'
retrieval_kwargs = dict(k=5, lambda_mult=0.5, fetch_k=10)
# -- inputs
save_folder = '../data/pdfs/'
prompt_template_file = os.path.join(os.getcwd(), '../scripts/prompt_template.txt')
pdf_list = glob.glob(os.path.join(os.getcwd(), save_folder, '*.pdf'))
web_list = []
langchain.debug = False

In [None]:
############

In [None]:
# LLM client built from the generation settings; shared by the chains below.
base_llm = AzureOpenAI(
    model_name=llm_model,
    engine=llm_engine,
    max_tokens=answer_max_tokens,
    temperature=temperature,
)

In [None]:
# Set LangChain's global debug flag to False
langchain.debug=False

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
    HumanMessage,
    SystemMessage    
)
from langchain.memory import ConversationSummaryBufferMemory, ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain, LLMChain

In [None]:
# Message sequence for a history-aware prompt: instruction, chat-history placeholder, question.
# NOTE(review): only referenced by `chat_prompt`, whose one use further down is commented out.
chat_messages = [
    HumanMessage(content="Answer question using the context and chat history"),
    MessagesPlaceholder(variable_name="chat_history"),
    HumanMessagePromptTemplate.from_template("Question: {question}"),
]

In [None]:
# Message sequence for the answering prompt; uses the {role}, {context} and {question} placeholders.
qna_messages = [
    SystemMessagePromptTemplate.from_template(
        "You are a chatbot who answers questions like a {role}"
    ),
    HumanMessage(content="Answer question using the following context"),
    HumanMessagePromptTemplate.from_template("{context}"),
    HumanMessagePromptTemplate.from_template("Question: {question}"),
    HumanMessage(content="Return answers in a bullet format"),
]

In [None]:
# Prompt object wrapping the history-aware message sequence
chat_prompt = ChatPromptTemplate(messages=chat_messages)

In [None]:
# Answering prompt with the {role} placeholder pre-filled.
# The role text is sent to the model verbatim, so its spelling matters
# (previously misspelled as 'finanical advisor').
qna_prompt = ChatPromptTemplate(
    messages=qna_messages, partial_variables={'role': 'financial advisor'}
)

In [None]:
# Conversation memory exposed under the "chat_history" key, with
# `history_tokens` passed as its token limit; history is returned as messages.
history_tokens = 2000
memory = ConversationSummaryBufferMemory(
    memory_key="chat_history",
    max_token_limit=history_tokens,
    return_messages=True,
    llm=base_llm,
)

In [None]:
# Conversational retrieval chain: retrieves chunks from the vector store,
# keeps history in `memory` and answers with the custom `qna_prompt`.
conversation_chain = ConversationalRetrievalChain.from_llm(
    base_llm,
    retriever=vectorstore.as_retriever(
        search_kwargs=retrieval_kwargs,
        search_type=search_type,
    ),
    memory=memory,
    # condense_question_prompt=chat_prompt,  (disabled)
    combine_docs_chain_kwargs={'prompt': qna_prompt},
)

In [None]:
# Ask one question through the conversational chain.
question = "should i invest in nvidia over google"
# NOTE: despite its name, `qna_chain` here holds the value returned by calling the chain, not a chain object.
qna_chain = conversation_chain({"question": question})

In [None]:
# Show the returned value
qna_chain

In [None]:
# Inspect the memory object attached to the chain
conversation_chain.memory

In [None]:
#######

In [None]:
# Stand-alone retriever over the vector store, using the search settings defined earlier
chunk_retriever=vectorstore.as_retriever(
    search_type=search_type, search_kwargs=retrieval_kwargs
)

In [None]:
# Fetch the chunks the retriever selects for the current question
relevant_chunks = chunk_retriever.get_relevant_documents(query=question)

In [None]:
# chunk_retriever.aget_relevant_documents(query=question)

In [None]:
import constants as c

In [None]:
# Prompt loaded from the template file; its input variables and the value
# pre-filled for 'role' both come from the constants module.
chain_prompt = PromptTemplate.from_file(
    prompt_template_file,
    partial_variables=dict(role=c.prompt_role),
    input_variables=c.prompt_input_variables,
)

In [None]:
# Retrieval QA chain over the vector store that also returns the retrieved source documents.
qna_chain = RetrievalQA.from_chain_type(
    base_llm, 
    retriever=vectorstore.as_retriever(
        search_type=search_type, search_kwargs=retrieval_kwargs
    ),
    return_source_documents=True,  
#     verbose=True,
    # NOTE(review): the prompt is passed as None, so `chain_prompt` built earlier is not used here — confirm this is intended.
    chain_type_kwargs={"prompt": None}
)

In [None]:
# qna_chain.verbose=True

In [None]:
# Show the prompt the chain's inner LLM chain ended up with
qna_chain.combine_documents_chain.llm_chain.prompt

In [None]:
# Run the QA chain on the current question
query_result = qna_chain({"query": question})

In [None]:
# Keys present in the returned mapping
query_result.keys()

In [None]:
# Generated answer
print(query_result['result'])

In [None]:
# Metadata of the first returned source document
print(query_result['source_documents'][0].metadata)

In [None]:
# Generated answer (printed again)
print(query_result['result'])

In [None]:
# Total number of characters across all returned source documents
source_doc_length = sum(
    len(doc.page_content) for doc in query_result['source_documents']
)
source_doc_length


In [None]:
# Question stored in the result
query_result['query']

In [None]:
# Text of the second returned source document
print(query_result['source_documents'][1].page_content)

In [None]:
# Generated answer
query_result['result']

In [None]:
# Metadata of the first returned source document
query_result['source_documents'][0].metadata

In [None]:
# Character count of the first returned source document
len(query_result['source_documents'][0].page_content)

In [None]:
########

In [None]:
# Retrieval QA chain with library defaults: only the LLM and a retriever over
# the vector store are supplied.
qna_chain = RetrievalQA.from_chain_type(
    base_llm,
    retriever=vectorstore.as_retriever(
        search_kwargs=retrieval_kwargs,
        search_type=search_type,
    ),
)