### Pipeline for QnA with Memory

In [1]:
####Notes####
# chunked data length can be atmost 16 for azureopenai

In [2]:
# !pip3 install langchain
# !pip install openai
# !pip3 install openai chromadb
# !pip3 install tiktoken

In [3]:
# !pip3 install unstructured
# !pip3 install pdf2image
# !pip3 install pdfminer
# !pip3 install pdfminer.six
# !pip3 install pymupdf 

In [4]:
import os
import sys
import glob
import re
import importlib
import langchain
from langchain.document_loaders import WebBaseLoader, UnstructuredPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import AzureOpenAI
from langchain.prompts import PromptTemplate
sys.path.append(os.path.join(os.getcwd(), '../scripts'))
import helpers as h
import constants as c
importlib.reload(h)
importlib.reload(c)

<module 'constants' from '/Users/rachitjoshi/Documents/Finbot/notebooks/../scripts/constants.py'>

#### Environment

In [5]:
os.environ["OPENAI_API_KEY"] = "6cdb659e5a9d402e80c212fe8ea26483"

In [6]:
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview"
os.environ["OPENAI_API_BASE"] = "https://test-chatgpt-flomoney.openai.azure.com/"
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

#### Paths

In [7]:
pdfs_path = os.path.join(os.getcwd(), '../data/pdfs')
merged_pdfs_path = os.path.join(os.getcwd(), '../data/pdfs_processed/merged')

#### PDF

In [8]:
investorcom_pdfs = [ _ for _ in glob.glob(os.path.join(pdfs_path, '*.pdf')) if 'investorcom' in _]

##### - Load

In [9]:
loaded_data = []
for pdf in investorcom_pdfs :
    loaded_data.extend(UnstructuredPDFLoader(file_path=pdf).load())        

In [10]:
len(loaded_data)

7

In [11]:
len(loaded_data[0].page_content)

6702

In [12]:
# for pdf in pdf_list :
#     try :
#         pdf_data.extend(UnstructuredPDFLoader(file_path=pdf).load())        
#     except NameError:
#         pdf_loader = UnstructuredPDFLoader(file_path=pdf)
#         pdf_data = pdf_loader.load()

##### - Split

In [13]:
chunk_size = c.prompt_max//c.retrieval_kwargs['k']
chunk_overlap = 0

In [14]:
data_splitter = RecursiveCharacterTextSplitter(
    chunk_size = chunk_size, chunk_overlap = chunk_overlap
)

In [15]:
chunked_data = data_splitter.split_documents(loaded_data)

In [16]:
len(chunked_data)

7

In [17]:
len(chunked_data[4].page_content)

7388

##### - Store

In [18]:
chunks_max = 15

In [19]:
vectorstore_engine = 'Finbot-embedding-2'

In [20]:
embedding_model = OpenAIEmbeddings(deployment=vectorstore_engine)

In [21]:
try:
    vectorstore.delete_collection()
except (ValueError, NameError):
    pass
for _ in range(0, len(chunked_data), chunks_max):
    vectorstore = Chroma.from_documents(
        documents=chunked_data[_: _ + chunks_max], embedding=embedding_model
    )

In [22]:
len(vectorstore.get()['documents'])

7

In [23]:
vectorstore.get().keys()

dict_keys(['ids', 'embeddings', 'metadatas', 'documents'])

##### - Retrieve

In [24]:
question = "what are the pros and cons of tesla stock and nvidia stock?"
docs = vectorstore.similarity_search_with_relevance_scores(question, k=7)
len(docs)

7

In [25]:
len(docs[0][0].page_content)

9214

In [26]:
# print(docs[1][0].page_content)

##### - Generate

In [27]:
llm_engine = 'finbot-gpt'
llm_model = 'text-davinci-002'
temperature = 0
search_type = 'mmr'
retrieval_kwargs = {'k': 5, 'lambda_mult': 0.5, 'fetch_k':10}
answer_max_tokens = 512
save_folder = '../data/pdfs/'
prompt_template_file = os.path.join(os.getcwd(), '../scripts/prompt_template.txt')
pdf_list = [
    _ for _ in glob.glob(os.path.join(os.getcwd(), save_folder, '*.pdf'))
]
web_list = []
langchain.debug=False

In [28]:
############

In [29]:
base_llm = AzureOpenAI(
    engine=llm_engine, 
    model_name=llm_model, 
    temperature=temperature, 
    max_tokens=answer_max_tokens
)

                engine was transferred to model_kwargs.
                Please confirm that engine is what you intended.


In [30]:
langchain.debug=True

In [31]:
from langchain.memory import ConversationSummaryBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [32]:
condense_question_template = os.path.join(
    os.getcwd(), '..', 'scripts/condense_question_template.txt'
)
condense_question_input = c.condense_question_input

In [33]:
chat_prompt = PromptTemplate.from_file(
    condense_question_template,
    input_variables=condense_question_input
)

In [34]:
qna_prompt_template = os.path.join(os.getcwd(), '..', 'scripts/qna_prompt_template.txt')
qna_prompt_input = c.qna_prompt_input
prompt_role = c.prompt_role

In [35]:
qna_prompt = PromptTemplate.from_file(
    qna_prompt_template,
    input_variables=qna_prompt_input,
    partial_variables={'role':prompt_role}
)

In [36]:
history_tokens = 2000
memory = ConversationSummaryBufferMemory(
    llm=base_llm, 
    memory_key="chat_history", 
    return_messages=True, 
    max_token_limit=history_tokens
)

In [37]:
qna_chain = ConversationalRetrievalChain.from_llm(
    base_llm, 
    retriever=vectorstore.as_retriever(
        search_type=search_type, search_kwargs=retrieval_kwargs
    ),
    memory=memory,
    condense_question_prompt=chat_prompt,
    combine_docs_chain_kwargs=dict(prompt=qna_prompt)
)

In [38]:
question = "what are the pros and cons of investing in meta stock over nvidia stock"
results = qna_chain({"question": question})

[32;1m[1;3m[chain/start][0m [1m[1:chain:ConversationalRetrievalChain] Entering Chain run with input:
[0m{
  "question": "what are the pros and cons of investing in meta stock over nvidia stock",
  "chat_history": []
}


Number of requested results 10 is greater than number of elements in index 7, updating n_results = 7


[32;1m[1;3m[chain/start][0m [1m[1:chain:ConversationalRetrievalChain > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:ConversationalRetrievalChain > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "what are the pros and cons of investing in meta stock over nvidia stock",
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:ConversationalRetrievalChain > 3:chain:StuffDocumentsChain > 4:chain:LLMChain > 5:llm:AzureOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
  ]
}


[36;1m[1;3m[llm/end][0m [1m[1:chain:ConversationalRetrievalChain > 3:chain:StuffDocumentsChain > 4:chain:LLMChain > 5:llm:AzureOpenAI] [14.94s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " Meta stock has rallied 170% so far in 2023, second only to Nvidia (NVDA) in the S&P 500. Investors have good reason to be excited about the outlook, and there are some potential near-term catalysts that might continue to fuel Meta stock. The company is hosting a Sept. 27 conference to reveal more about its AI and metaverse initiatives. The event will feature the launch of Meta's new Quest 3 mixed reality headset. Still, it wouldn't be surprising to see Meta stock's fantastic run take a breather. While there are definitely reasons to be excited about the impact of generative AI on Meta results, that's more of a long-term story. A closer look at the earnings turnaround also suggests Meta's earnings momentum may be somewhat less than meets the eye. That's onl

In [39]:
print(results['answer'])

 Meta stock has rallied 170% so far in 2023, second only to Nvidia (NVDA) in the S&P 500. Investors have good reason to be excited about the outlook, and there are some potential near-term catalysts that might continue to fuel Meta stock. The company is hosting a Sept. 27 conference to reveal more about its AI and metaverse initiatives. The event will feature the launch of Meta's new Quest 3 mixed reality headset. Still, it wouldn't be surprising to see Meta stock's fantastic run take a breather. While there are definitely reasons to be excited about the impact of generative AI on Meta results, that's more of a long-term story. A closer look at the earnings turnaround also suggests Meta's earnings momentum may be somewhat less than meets the eye. That's only partly because of ongoing expenses on the metaverse that won't pay off anytime soon. Regulation also continues to pose risks. Nvidia, on the other hand, is a giant in data centers and gaming, and it continues to sizzle. The chip gi

In [457]:
qna_chain.memory

ConversationSummaryBufferMemory(human_prefix='Human', ai_prefix='AI', llm=AzureOpenAI(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, client=<class 'openai.api_resources.completion.Completion'>, model_name='text-davinci-002', temperature=0.0, max_tokens=512, top_p=1, frequency_penalty=0, presence_penalty=0, n=1, best_of=1, model_kwargs={'engine': 'finbot-gpt'}, openai_api_key='6cdb659e5a9d402e80c212fe8ea26483', openai_api_base='https://test-chatgpt-flomoney.openai.azure.com/', openai_organization='', openai_proxy='', batch_size=20, request_timeout=None, logit_bias={}, max_retries=6, streaming=False, allowed_special=set(), disallowed_special='all', tiktoken_model_name=None, deployment_name='', openai_api_type='azure', openai_api_version='2023-03-15-preview'), prompt=PromptTemplate(input_variables=['summary', 'new_lines'], output_parser=None, partial_variables={}, template='Progressively summarize the lines of conversation provided, adding onto

In [None]:
#######

In [31]:
chunk_retriever=vectorstore.as_retriever(
    search_type=search_type, search_kwargs=retrieval_kwargs
)

In [32]:
relevant_chunks = chunk_retriever.get_relevant_documents(query=question)

In [33]:
# chunk_retriever.aget_relevant_documents(query=question)

In [35]:
import constants as c

In [36]:
chain_prompt = PromptTemplate.from_file(
    prompt_template_file,
    input_variables=c.prompt_input_variables,
    partial_variables={'role':c.prompt_role}
)

AttributeError: module 'constants' has no attribute 'prompt_role'

In [37]:
qna_chain = RetrievalQA.from_chain_type(
    base_llm, 
    retriever=vectorstore.as_retriever(
        search_type=search_type, search_kwargs=retrieval_kwargs
    ),
    return_source_documents=True,  
#     verbose=True,
    chain_type_kwargs={"prompt": None}
)

In [38]:
# qna_chain.verbose=True

In [39]:
qna_chain.combine_documents_chain.llm_chain.prompt

PromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:", template_format='f-string', validate_template=True)

In [66]:
query_result = qna_chain({"query": question})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "what are the pros and cons of tesla stock and nvidia stock?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "what are the pros and cons of tesla stock and nvidia stock?",
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain > 5:llm:AzureOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
  ]
}


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain > 5:llm:AzureOpenAI] [7.50s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\n\n\n\n# Tesla\n# Pros:\n# - Tesla is a leader in the electric vehicle market, which is a rapidly growing industry.\n# - Tesla has a strong brand and a loyal customer base.\n# - Tesla has a strong balance sheet with a lot of cash on hand.\n# - Tesla has a visionary CEO in Elon Musk who has a track record of innovation and success.\n\n# Cons:\n# - Tesla faces increasing competition in the electric vehicle market from established automakers and new entrants.\n# - Tesla's valuation is very high, which makes it vulnerable to a market downturn or a shift in investor sentiment.\n# - Tesla has faced regulatory scrutiny and legal challenges related to its Autopilot system and other safety issues.\n# - Tesla's gross margins have been declining, which could put pressure on its profit

In [44]:
query_result.keys()

dict_keys(['query', 'result', 'source_documents'])

In [45]:
print(query_result['result'])





# Tesla
# Pros:
# - Tesla is a leader in the electric vehicle market, which is a rapidly growing industry.
# - Tesla has a strong brand and a loyal customer base.
# - Tesla has a strong balance sheet with a lot of cash on hand.
# - Tesla has a visionary CEO in Elon Musk who has a track record of innovation and success.

# Cons:
# - Tesla has a high valuation, which makes it vulnerable to market downturns.
# - Tesla faces increasing competition from other automakers who are investing heavily in electric vehicles.
# - Tesla has had some quality control issues in the past, which could hurt its reputation.
# - Tesla is facing regulatory scrutiny over its Autopilot system, which could lead to increased costs and liability.

# Nvidia
# Pros:
# - Nvidia is a leader in the graphics processing unit (GPU) market, which is a rapidly growing industry.
# - Nvidia has a strong balance sheet with a lot of cash on hand.
# - Nvidia has a strong brand and a loyal customer base.
# - Nvidia is expandi

In [289]:
print(query_result['source_documents'][0].metadata)

{'source': '/Users/rachitjoshi/Documents/Finbot/notebooks/../data/pdfs/tesla_investorcom_1.pdf'}


In [120]:
print(query_result['result'])

 Tesla stock is a high-risk, high-reward investment. The company has a history of volatility and is heavily dependent on the success of its electric vehicles. Nvidia stock is a more stable investment, with a strong track record of growth and a diversified business model. However, it is also subject to the cyclical nature of the semiconductor industry. Ultimately, the decision to invest in either stock will depend on your risk tolerance and investment goals. It is important to do your own research and consult with a financial advisor before making any investment decisions.<|im_end|>


In [119]:
source_doc_length = 0
for doc in query_result['source_documents']:
    source_doc_length += len(doc.page_content)
source_doc_length    


23866

In [120]:
query_result['query']

'what are the pros and cons of tesla stock and nvidia stock?'

In [123]:
print(query_result['source_documents'][1].page_content)

TECHNOLOGY

Is Amazon Stock A Buy As It Restructures Workforce In Tough Times?

BRIAN DEAGON 03:03 PM ET 04/14/2023

F or years, Amazon seemed invincible, an e-commerce giant that made other

companies shiver when it muscled into their markets. It helped Amazon stock soar

into four-digit territory, and the company's earnings reports often delighted

investors.

Now, after a year of troubling earnings reports, Amazon has launched a major

restructuring, including plans to cut 27,000 employees from its workforce. Is

Amazon stock a buy?

In late March, Amazon announced plans to slash 9,000 jobs. That's on top of the

18,000 job cuts it announced in January.

"Given the uncertain economy in which we reside, and the uncertainty that exists in

the near future, we have chosen to be more streamlined in our costs and head

count," Amazon Chief Executive Andy Jassy said in a written notice to employees

when the layoffs were announced. He also suggested more layoffs are possible. The

cuts ma

In [122]:
query_result['result']

" \nTesla stock has been a monster stock over much of its history, especially during its stratospheric run from mid-2019 to late 2021. The stock hit a bear market low of 101.84 on Jan. 6, but roared back until Q1 earnings. Nvidia is seen as one of the biggest winners of the AI boom. Nvidia stock has skyrocketed 206% year to date, after crashing in 2022. Nvidia is a leader in AI chips. In the tech industry's fierce battle for AI dominance, the advanced chips needed for generative AI, such as the ChatGPT chatbot, are key. However, Nvidia stock is not a buy at this time. It's still in a long-term downtrend but has been recovering since it hit a low at the start of the year. Nvidia stock is now forming a cup base with a 114.10 buy point, and the price action is so far pretty smooth. Even if Nvidia breaks out, it has a mountain to climb back. Overhead supply could retard any advance Nvidia tries in the next several months. Tesla stock is greatly extended from a cup base with a 207.79 buy po

In [70]:
query_result['source_documents'][0].metadata

{'source': '/Users/rachitjoshi/Documents/Finbot/notebooks/../data/pdfs_processed/merged/investorcom_merged.pdf'}

In [71]:
len(query_result['source_documents'][0].page_content)

11929

In [None]:
########

In [None]:
qna_chain = RetrievalQA.from_chain_type(
    base_llm, 
    retriever=vectorstore.as_retriever(
        search_type=search_type, search_kwargs=retrieval_kwargs
    )
)