## Document Loading

In [1]:
import os 
import openai
import sys
# Add the directory two levels up (relative path) to the module search path.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Load the .env file located by find_dotenv() into the process environment, then
# hand the key to the openai client. A missing OPENAI_API_KEY raises KeyError here,
# before any request is attempted.
load_dotenv(find_dotenv())
openai.api_key = os.environ['OPENAI_API_KEY']

In [2]:
from langchain.document_loaders import PyPDFLoader

In [3]:
# Loader for the fund-flows report; the PDF is referenced by a relative path.
loader = PyPDFLoader("MS_Q1_fund_flows.pdf")

In [4]:
# Read the PDF; each returned item carries 'source' and 'page' metadata (inspected below).
pages = loader.load()

In [5]:
# Number of items the loader produced.
len(pages)

35

In [6]:
# Metadata attached to the first loaded item.
pages[0].metadata

{'source': 'MS_Q1_fund_flows.pdf', 'page': 0}

## Document Splitting

`RecursiveCharacterTextSplitter` is the recommended splitter for generic text.

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Chunks are capped at 1500 and neighbouring chunks share 150, so text that is cut
# at a chunk boundary still appears intact in one of the two chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=1500)

In [8]:
# Split the loaded pages into chunks; `docs` is what gets indexed in the vector store.
docs = text_splitter.split_documents(pages)

In [9]:
# Number of chunks produced by the splitter.
len(docs)

64

In [10]:
# Inspect the first chunk (text extracted from the report's opening page).
docs[0]

Document(page_content='? \n \n \n \n Global  Sustainable  Fund  Flows:  Q1 2023 in Review   \nA slow start to the y ear, with lower inflows  and product development , \nbut assets  climb  to USD 2. 7 trillion.  \nKey Takeaways   \n× Global sustainable funds attracted USD 29 billion of net new money in the first quarter of 2023, down \nfrom  nearly  USD 38 billion  in the previous  quarter.  \n× Inflows were lower in most regions  as macroeconomic pressures, including rising interest rates , \ninflation , and a looming recession, continue d to weigh  on investor sentiment.  Sustainable funds in \nthe U nited States  experienced their third quarter of outflows in a year.  \n× Despite lower inflows  but helped by higher valuations , global sustainable fund assets continued their \nrecovery to hit USD 2.7 4 trillion at the end of March.   \n× Product development cooled down . Europe sa w a significant reduction of new sustainable fund \nlaunche s, amid regulatory uncertainty  and fears of 

## Vector Stores and Embeddings

### Embeddings

In [11]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding client created with its default settings; handed to the vector store below.
embedding = OpenAIEmbeddings()

### Vector Stores

In [12]:
from langchain.vectorstores import Chroma

In [13]:
# Relative directory passed to Chroma as its persist_directory.
persist_directory = 'chroma/'

In [14]:
# Build the index once and reuse it on later runs. Calling Chroma.from_documents
# against a persist_directory that already holds this collection adds the same
# chunks again, so a re-run would double the collection and searches would return
# duplicate hits. Reusing the stored index also avoids paying for the embeddings twice.
# Delete the directory to force a rebuild (e.g. after changing the PDF or the splitter).
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding,
    )
else:
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        persist_directory=persist_directory,
    )

## Similarity Search versus Maximal Marginal Relevance (MMR) Search

In [15]:
# Query used for both retrieval strategies below.
question = "Where do we see the biggest ESG fund inflow?"

In [16]:
# Keep the search hits under their own name. Rebinding `docs` here would replace the
# split chunks, so re-running the indexing cell afterwards would index only these hits.
docs_similar = vectordb.similarity_search(question, k=3)  # top 3 answers
docs_similar[1].page_content

"Page  8 of 35 \n \n    \n \n    \n \n    \n \n    \n \n    \n \n    \n \n    Global  Sustainable  Fund  Flows —Q1 2023 in Review  | See Important  Disclosures.  \n \n      \n \n      \n \n      \n \n      \n \n      \n \n      \n \n      addition to sectoral exclusion s, the fund  screen s out ESG laggard s based on Swisscanto's proprietary \nrating  and aims for a 20% lower carbon dioxide equivalent intensity score than  the parent benchmark .  \n \nThe second- biggest money -gather er last quarter was BlackRock ACS World ESG Insights Equity Fund , \nwhich combines both exclusionary screens and a tilt toward companies better aligned with the longer -\nterm sustainable, transformative change.  The fund seeks to have 50% lower exposure to carbon \nemission intensity a nd potential emissions from fossil fuel reserves relative to its reference benchmark .  \n \nExhibit  8a Top 10 Sustainable  Fund  Flows  in First-Quarter  2023 \n \n  \nSource:  Morningstar  Direct,  Manager  Research.  

In [18]:
# Same query and k through max_marginal_relevance_search, for comparison with the similarity hits.
docs_mmr = vectordb.max_marginal_relevance_search(question, k =3)

In [22]:
# Text of the second MMR hit.
docs_mmr[1].page_content

'and iShares ESG USD Corporate Bond ETF SUSC —led the way, netting USD 420 million and USD 159 \nmillion, respectively. Meanwhile, sustainable equity funds shed USD 5.4 billion during the quarter, their \nthird quarter of outflows in the past year.  Here, too, iShares ESG Aware MSCI USA ETF drove the overall \nnarrative.  \n  Fund NameNet Flows \n(USD Million)\niShares ESG Aware MSCI EM ETF 477\nBlackRock Sustainable Advantage Large Cap Core Fund 475\nGMO Resource Transition Fund 460\niShares ESG U.S. Aggregate Bond ETF 419\nCalvert Equity Fund 419\nBrown Advisory Sustainable Growth Fund 306\nVanguard FTSE Social Index Fund 240\nDimensional US Sustainability Core 1 ETF 209\niShares ESG USD Corporate Bond ETF 159\nVanguard ESG International Stock ETF 157'

In [23]:
# Sanity check: number of entries in the underlying collection.
# NOTE(review): _collection is a private attribute of the wrapper; this may break on upgrade.
print(vectordb._collection.count())

64


## Question Answering

In [25]:
# Add a chat LLM so the retrieved chunks can be turned into a natural-language answer.
# Use the maintained "gpt-3.5-turbo" alias: the dated "gpt-3.5-turbo-0301" snapshot
# has been retired by OpenAI, so requests pinned to it are rejected.
llm_name = 'gpt-3.5-turbo'

In [26]:
from langchain.chat_models import ChatOpenAI
# temperature=0: factual response, not too creative
llm = ChatOpenAI(
    temperature=0,
    model_name=llm_name,
)

### RetrievalQA chain

In [27]:
from langchain.chains import RetrievalQA

In [28]:
# Question-answering chain: the vector store supplies the context, the LLM writes the answer.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [29]:
# Run the chain; the question goes in under the "query" key.
result = qa_chain({"query": question})

In [30]:
# The generated answer is stored under the "result" key.
result["result"]

'According to Exhibit 29, Taiwan experienced the largest net inflows at USD 710 million in the first quarter of 2023 for Asia ex-Japan Sustainable Fund Flows. However, if we look at Exhibit 8a, iShares ESG Aware MSCI EM ETF and BlackRock Sustainable Advantage Large Cap Core Fund were the top two funds in terms of net flows, both receiving USD 477 million and USD 475 million, respectively.'

### Adding Prompt

In [31]:
from langchain.prompts import PromptTemplate 


# Prompt for the answer-generation step. It exposes two placeholders, {context} and
# {question}. The fixed sign-off is presumably there so the output shows at a glance
# that this prompt, rather than the chain's built-in one, was used.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for going green!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""

# Wrap the raw string in a PromptTemplate so it can be passed to the chain.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [32]:
# Rebuild the chain so it answers with the custom prompt and also hands back the
# chunks it retrieved.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
    retriever=vectordb.as_retriever(),
)

In [33]:
# New question, answered by the chain that uses the custom prompt.
question = "Where do we see the biggest outflows?"
result = qa_chain({"query": question})

In [34]:
# Generated answer for the outflows question.
result["result"]

'Singapore and Hong Kong saw the biggest outflows of USD 157 million and USD 117 million, respectively, followed by India at USD 41 million in the Asia ex-Japan Sustainable Fund Flows exhibit. Thanks for going green!'

In [35]:
# NOTE(review): the question presupposes net outflows, but the first chunk of the report
# (displayed earlier) describes USD 29 billion of net new money, i.e. inflows. Treat the
# recorded answer with caution.
question = "What is the total outflow amount in Q1?"
result = qa_chain({"query": question})
result["result"]

'The global universe of sustainable funds saw net outflows of USD 29 billion in the first quarter of 2023, which was lower than the revised USD 37.7 billion of the last quarter of 2022. The Japanese sustainable fund market saw net outflows amounting to USD 961 million in the same period. Thanks for going green!'

### Adding chat history

In [37]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# Buffer that stores the running conversation under the "chat_history" key.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

# A memory object only has an effect once a chain is built with it. qa_chain (RetrievalQA)
# was created without one, so this conversational chain is what carries history between
# turns: call chat_chain({"question": ...}) and read the reply from result["answer"].
chat_chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=vectordb.as_retriever(),
    memory=memory,
)

In [38]:
# First turn of the follow-up sequence; repeats the question asked in the previous section.
question = "What is the total outflow amount in Q1?"
result = qa_chain({"query": question})
result["result"]

'The global universe of sustainable funds saw net outflows of USD 29 billion in the first quarter of 2023, which was lower than the revised USD 37.7 billion of the last quarter of 2022. The Japanese sustainable fund market saw net outflows amounting to USD 961 million in the same period. Thanks for going green!'

In [40]:
# Follow-up that relies on the previous turn ("these outflows").
# NOTE(review): qa_chain receives only the new query here; nothing in this cell passes
# the earlier turns to it.
question = "Can you provide a breakdown of these outflows?"
result = qa_chain({"query": question})
result["result"]

'Yes, the outflows were driven by electric car manufacturers, and Singapore and Hong Kong saw the largest outflows followed by India. On the other hand, Taiwan experienced the largest net inflows while Malaysia saw the second-most net inflows. Thanks for going green!'

In [41]:
# Second follow-up, referring to "the previous answer".
# NOTE(review): as above, only the new query is sent to qa_chain, so the chain has no
# previous answer to refer to.
question = "Do you have more figures on the previous answer?"
result = qa_chain({"query": question})
result["result"]

'No, there is no additional information provided on the previous answer. Thanks for going green!'