In [3]:
import os
from langchain_openai import AzureChatOpenAI

os.environ["OPENAI_API_VERSION"] = "2024-02-15-preview"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://openaimodelv3si.openai.azure.com/"
os.environ["AZURE_OPENAI_API_KEY"] = "..."

llm = AzureChatOpenAI(
    openai_api_version="2023-05-15",
    azure_deployment="trial1",
)

from langchain.chains import RetrievalQA
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)

review_template = """Your job is to use ESG (A sustainability report is a report published by companies on the environmental, social and governance (ESG) impacts of their activities) 
documents and annual reports to answer questions. Use
the following context to answer questions. Be as detailed as possible, but
don't make up any information that's not from the context. If you don't know
an answer, say you don't know.
{context}
"""

esg_system_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(input_variables=["context"], template=review_template)
)

esg_human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(input_variables=["question"], template="{question}")
)
messages = [esg_system_prompt, esg_human_prompt]

esg_prompt = ChatPromptTemplate(
    input_variables=["context", "question"], messages=messages
)

In [7]:
from langchain_community.document_loaders import PyPDFLoader

# loader = PyPDFLoader("/Users/aryan/Desktop/fargowells/mlpoc/hackathon/sustainability-survey-automation/training-data/2021-annual-report.pdf, /Users/aryan/Desktop/fargowells/mlpoc/hackathon/sustainability-survey-automation/training-data/climate-disclosure.pdf")
# pages = loader.load_and_split()

dir_path = "/Users/aryan/Desktop/fargowells/mlpoc/hackathon/sustainability-survey-automation/training-data"

def process_pdf_files(dir_path):
    paths = os.listdir(dir_path)
    knowledge = []
    
    for path in paths:
        if path.endswith(".pdf"):
            loader = PyPDFLoader(os.path.join(dir_path, path))
            pages = loader.load_and_split()
            knowledge.extend(pages)
            print(len(knowledge))
    
    return knowledge

knowledge = process_pdf_files(dir_path)

from langchain_openai import AzureOpenAIEmbeddings
# from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma

embeddings = AzureOpenAIEmbeddings(
    azure_deployment="embed1",
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
)
vector = Chroma.from_documents(knowledge, embeddings)

3
38
79
259
612
673
680
1065
1066
1077
1087
1096
1190
1559
1630
1664
1678
1716
1733
1752
1855


#### direct pickle store not working

In [9]:
import pickle
# Specify the file path where you want to save the vector
vector_file_path = "/Users/aryan/Desktop/fargowells/mlpoc/hackathon/all_21_knowledge.pkl"

# Save the vector to the file
with open(vector_file_path, "wb") as file:
    pickle.dump(vector, file)

TypeError: cannot pickle 'sqlite3.Connection' object

### add documents one by one

In [26]:
def add_new_doc(path, pages):
    loader = PyPDFLoader(path)
    new_pages = loader.load_and_split()
    pages.extend(new_pages)
    vector = Chroma.from_documents(pages, embeddings)
    return pages, vector

In [27]:
pages, vector = add_new_doc("/Users/aryan/Desktop/fargowells/mlpoc/hackathon/sustainability-survey-automation/training-data/climate-disclosure.pdf", pages)
len(pages)

414

#### trying FAISS store

In [10]:
from langchain_community.vectorstores import FAISS
loader = PyPDFLoader("/Users/aryan/Desktop/fargowells/mlpoc/hackathon/sustainability-survey-automation/training-data/2021-annual-report.pdf")
pages = loader.load_and_split()

embeddings = AzureOpenAIEmbeddings(
    azure_deployment="embed1",
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
)
vector = FAISS.from_documents(pages, embeddings)

In [13]:
# vector = FAISS.from_documents(docs, embedding_function) 
vector2 = vector.save_local("faiss_index") 
vector3 = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [23]:
vector3

<langchain_community.vectorstores.faiss.FAISS at 0x14f088110>

###### change here

In [17]:
# retriever = vector.as_retriever()
retriever = vector3.as_retriever()

In [18]:
esg_vector_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

query = "Identify the position(s) (do not include any names) of the individual(s) on the board with responsibility for climate-related issues."
response = esg_vector_chain.invoke(query)
response

{'query': 'Identify the position(s) (do not include any names) of the individual(s) on the board with responsibility for climate-related issues.',
 'result': 'The position on the board with responsibility for climate-related issues is the Chairman of the Corporate Responsibility Committee.',
 'source_documents': [Document(page_content='Board of Directors \nSteven D. Black Maria R. Morris \nChairman, Retired Executive Vice President and Head of \nWells Fargo & Company Global Employee Benefits business, MetLife, Inc. \nMark A. Chancy Richard B. Payne, Jr. \nRetired Vice Chairman and Co-Chief Operating Retired Vice Chairman, Wholesale Banking, \nOfficer, SunTrust Banks, Inc. U.S. Bancorp \nCeleste A. Clark Juan A. Pujadas \nPrincipal, Abraham Clark Consulting, LLC, and Retired Senior Retired Principal, PricewaterhouseCoopers LLP, \nVice President, Global Public Policy and External Relations and former Vice Chairman, Global Advisory Services, \nand Chief Sustainability Officer, Kellogg Com

#### vectorspace stored locally using FAISS ABOVE
### now trying to store all knowledge in a single file

In [19]:
knowledge

[Document(page_content='© 2023 Wells Fargo & Company. All rights reserved.  \nUPN USCon2311 (Rev. 10/04/23 ) Page 1 of 3  Wells Fargo U.S. Consumer Privacy Notice  Rev. 10/23 \n \n   FA CTS WHAT DOES WELLS FARGO DO \nWITH YOUR PERSONAL INFORMATION? \nWhy?  Financial companies choose how they share your personal information. Federal law gives consumers \nthe right to limit some but not all sharing. Federal law also requires us to tell you how we collect, \nshare, and protect your personal information. Please read this notice carefully to understand what \nwe do. \nWhat? The types of personal information we collect, and share depend on the product or service you \nhave with us. This information can include: \n• Social Security number and employment information  \n• Account balances and transaction history  \n• Credit history and investment experience  \nHow? All financial companies need to share customers’ personal information to run their everyday business. \nIn the section below, we li

In [21]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = AzureOpenAIEmbeddings(
    azure_deployment="embed1",
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
)
vector_all_knowledge = FAISS.from_documents(knowledge, embeddings)

In [24]:
vector_all_knowledge.save_local("faiss_index_all_21_knowledge")
vector_all_knowledge_retrived = FAISS.load_local("faiss_index_all_21_knowledge", embeddings, allow_dangerous_deserialization=True)

In [25]:
vector_all_knowledge.similarity_search("Identify the position(s) (do not include any names) of the individual(s) on the board with responsibility for climate-related issues.")
vector_all_knowledge_retrived.similarity_search("Identify the position(s) (do not include any names) of the individual(s) on the board with responsibility for climate-related issues.")

[Document(page_content='18 Wells Fargo | 2022 TCFD Report Governance Strategy Risk management Metrics and targets \nSignificant matters involving climate-related risks and opportunities may be escalated to the management \ngovernance committees aligned to impacted lines of businesses or functions. For example, the Public Affairs Risk and Control Committee may be briefed on the risks and controls associated with significant voluntary ESG disclosures, whereas the risk and control committee aligned to a particular business might review the risk impacts of a climate-related strategy or opportunity. In this way, our approach to the risk impacts of climate-related strategies and opportunities is integrated into the existing governance structure of the Company. \nSteering committees and other forums \nTo facilitate enhanced management attention to emerging risks and opportunities, we also rely on steering committees and other forums with a climate-specific focus. These include forums like the