In [22]:
! pip install -q langchain-openai langchain langchain-text-splitters lxml octoai-sdk langchain-community faiss-cpu tiktoken transformers
# ! zsh standalone_embed.sh start

In [23]:
from dotenv import load_dotenv
import os

load_dotenv()
OCTOAI_API_TOKEN = os.environ['OCTOAI_API_TOKEN']
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [24]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter

url = "https://www.microsoft.com/en-us/corporate-responsibility/sustainability-journey"

headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
    ("h4", "Header 4"),
    ("div", "Divider")
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# for local file use html_splitter.split_text_from_file(<path_to_file>)
html_header_splits = html_splitter.split_text_from_url(url)

In [25]:
chunk_size = 1024
chunk_overlap = 128
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Split
splits = text_splitter.split_documents(html_header_splits)

In [26]:
from langchain.vectorstores import FAISS

In [27]:
from langchain_community.embeddings import OctoAIEmbeddings
from langchain_community.llms.octoai_endpoint import OctoAIEndpoint
llm = OctoAIEndpoint(
        model="llama-2-13b-chat-fp16",
        max_tokens=1024,
        presence_penalty=0,
        temperature=0.1,
        top_p=0.9,
        
    )
embeddings = OctoAIEmbeddings(endpoint_url="https://text.octoai.run/v1")

                model was transferred to model_kwargs.
                Please confirm that model is what you intended.


In [28]:
vector_store = FAISS.from_documents(
    splits,
    embedding=embeddings
)

In [29]:
retriever = vector_store.as_retriever()

In [30]:
from langchain.prompts import ChatPromptTemplate
template="""You are a sustainability report writer. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
Question: {question} 
Context: {context} 
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [31]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [32]:
chain.invoke("Write a report to the SEC that discloses information about the registrant’s climate-related targets or goals, if any, that have materially affected or are reasonably likely to materially affect the registrant's business, results of operations, or financial condition. Disclosures would include material expenditures and material impacts on financial estimates and assumptions as a direct result of the target or goal or actions taken to make progress toward meeting such target or goal.")

" \n\nAs a sustainability report writer, I am pleased to provide the following report to the SEC regarding Microsoft's climate-related targets or goals that have materially affected or are reasonably likely to materially affect the registrant's business, results of operations, or financial condition.\n\nMicrosoft has set several climate-related targets and goals, which have had a significant impact on the company's business, results of operations, and financial condition. One of the most notable targets is the company's commitment to become carbon negative by 2030. This goal has led to significant investments in renewable energy, energy efficiency, and carbon offsetting.\n\nIn 2020, Microsoft announced that it would invest $1 billion in a clean energy fund to support the development of renewable energy projects around the world. This investment has helped to reduce the company's reliance on fossil fuels and has also created new business opportunities for Microsoft.\n\nIn addition to it