In [1]:
import os
from getpass import getpass
# Ask for the key only when the environment does not already provide one, so
# re-running this cell (or starting Jupyter with the key exported) neither
# blocks on input nor overwrites a working key with a new entry.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API Key:')

In [2]:
## Init main variables

# Root of the notebook workspace. Set the RESAI_NOTEBOOK_ROOT environment
# variable to run on another machine; when it is unset or empty the original
# author's location is used, so existing behaviour is unchanged.
root_path = os.environ.get("RESAI_NOTEBOOK_ROOT") or "/Users/ryanlindbeck/Development/resai/app/backend/notebooks"

# Directory that holds the source PDFs (one sub-directory per document set).
doc_path = f"{root_path}/documents"

# Directory passed to Chroma as its persist directory.
# NOTE(review): "vector;..db" looks like a typo (perhaps "vector.db"). It is
# left unchanged because renaming it would point at a different index - confirm.
db_path = f"{root_path}/vector;..db"

# File name of a single PDF; referenced by the commented-out single-file
# loaders and by the raw-Chroma example cell.
document_name = "mod_5_transcript.pdf"

# Sub-directory of doc_path that is loaded; also used as the Chroma collection name.
documents_dirs = "fin570"

In [15]:
# Questions

# Full wording of question 1, kept for reference only (not sent to the model):
#   1) Describe your company's capital structure over the last 3 years.
#   2) Describe your competitors capital structure over the last 3 years.
#   3) Compare the capital structures of your company's and your competitors.

# The part of the question that is actually asked.
# NOTE(review): "your Microsoft's" reads like a leftover from "your company's";
# the text is kept exactly as written because it is part of the prompt.
question_part = "\nDescribe your Microsoft's capital structure over the last 3 years.\n"

# Guidance appended after the question. Spelled out line by line so that the
# trailing spaces belonging to the text stay visible.
notes = (
    "\n"
    "Note:\n"
    "You should analyze leverage ratios in this question. \n"
    "Use the market value of equity to calculate leverage ratios. \n"
    "Focus on the leverage ratio defined as total debt / (total debt + market value of equity). \n"
    "\n"
    "Refer to historical data on the market value of equity for your companies.\n"
)

# Figures pasted into the prompt; number formatting is left exactly as pasted.
# NOTE(review): the notes ask for the market value of equity while these rows
# are labelled "Total Common Equity" - confirm which figure is intended.
cap_struct = """
Microsoft Capital Structure (USD - Millions):
Year 2020:
- Total Debt: 82110.0
- Total Common Equity: 118,304.00

Year 2021:
- Total Debt: 82,278.00
- Total Common Equity: 141,988.00

Year 2022:
- Total Debt: 78,400.00
- Total Common Equity: 166,542.00


Apple Capital Structure (USD - Millions):
Year 2020:
- Total Debt: 122,278.00
- Total Common Equity: 65,339.00

Year 2021:
- Total Debt: 136522.00
- Total Common Equity: 63090.00

Year 2022:
- Total Debt: 132,480.00
- Total Common Equity: 50,672.00
"""

# Final prompt text: the three sections separated by a blank line, wrapped in
# one leading and one trailing newline.
question = "\n" + "\n\n".join([question_part, notes, cap_struct]) + "\n"

In [3]:
# Load documents into Vector database.

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TextSplitter, CharacterTextSplitter, SentenceTransformersTokenTextSplitter

# Embeddings come from a Sentence-Transformers model wrapped by LangChain.
#   https://www.sbert.net/
#   https://docs.trychroma.com/embeddings
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Load every PDF found under <doc_path>/<documents_dirs>.
# Single-file alternatives, kept for reference:
#   loader = PyMuPDFLoader(f"{doc_path}/{document_name}")
#   loader = PyPDFLoader(f"{doc_path}/{document_name}")
pdf_directory = f"{doc_path}/{documents_dirs}"
loader = PyPDFDirectoryLoader(pdf_directory)

# Split the loaded PDFs into chunks using the splitter's default settings.
ts = SentenceTransformersTokenTextSplitter()
documents = loader.load_and_split(text_splitter=ts)

# Separate each chunk's text from its metadata; both lists keep the same order.
texts = []
metadatas = []
for chunk in documents:
    texts.append(chunk.page_content)
    metadatas.append(chunk.metadata)

# Vector store in the persist directory; the collection is named after the
# document directory. Per-file alternative: collection_name=document_name
store = Chroma(
    persist_directory=db_path,
    collection_name=documents_dirs,
    embedding_function=embedding_function,
)

# NOTE(review): this runs unconditionally, so re-running the cell may add the
# same chunks to the collection again - confirm how Chroma treats repeats.
store.add_texts(texts, metadatas=metadatas)

print(f"Done loading {len(texts)} documents into Chroma collection.")

  from .autonotebook import tqdm as notebook_tqdm


Done loading 954 documents into Chroma collection.


In [16]:
# Search for similar documents.

# Retrieve the 5 chunks closest to the question text.
search_result_documents = store.similarity_search(question, k=5)

print(f"Total Results: {len(search_result_documents)}")


def _page_sort_key(doc):
    """Sort key: ascending page number; results without a 'page' entry go last.

    The loop below treats 'page' as optional metadata, so the sort must not
    raise KeyError when it is absent.
    """
    if 'page' not in doc.metadata:
        return (1, 0)
    return (0, int(doc.metadata['page']))


# Sort search results by page number (stable: equal pages keep search order).
sorted_results = sorted(search_result_documents, key=_page_sort_key)

# The materials: one formatted text block per search result.
doc_prompts: list[str] = []

# Page values in the same order as doc_prompts (only for results that have one).
page_numbers: list[int] = []

for doc in sorted_results:
    metadata = doc.metadata
    page_content = doc.page_content
    # Missing metadata fields are rendered as empty strings in the prompt.
    source = metadata.get("source", '')
    page = metadata.get("page", '')

    if 'page' in metadata:
        page_numbers.append(page)

    doc_prompts.append(
        "\n"
        "Material Content:\n"
        f"{page_content}\n"
        "\n"
        "Material Metadata:\n"
        f"Source: {source}\n"
        f"Page: {page}\n"
    )


print(f"Page Numbers: {page_numbers}")
print("Doc Prompts")
print(doc_prompts)

Total Results: 5
Page Numbers: [50, 55, 99, 106, 107]
Doc Prompts
["\nMaterial Content:\ncorporate finance i : measuring and promoting value creation professors stefan zeume & heitor almeida my recommendation is that we look at ratios number two and number three. but i wanted to discuss that over assets because that ratio is commonly used, and you might encounter it when you read beyond this course. i thought it was important that we discussed that first ratio but we will focus on ratios number two and three. before let's try to calculate these leverage ratios using data from real world companies. okay. and let's focus on altice first, let's figure out how much leverage this company actually has. you see all the data on the slide, it's taken from capital like you. on the left, you have data on the\n\nMaterial Metadata:\nSource: /Users/ryanlindbeck/Development/resai/app/backend/notebooks/documents/fin570/module1.pdf\nPage: 50\n", "\nMaterial Content:\ncorporate finance i : measuring and

In [17]:
from langchain import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI

# Completion model with temperature 0.
# Chat alternative, kept for reference:
#   llm = ChatOpenAI(temperature=0, model="gpt-4")
llm = OpenAI(temperature=0.0)

# Instruction placed at the top of the prompt.
template_heading = (
    "\n"
    "Answer each of the following questions. Please provide a detailed answer on how you arrived at your answer.\n"
    "Use the provided materials to help you answer the question.\n"
)

# All retrieved material blocks, one after another.
documents_text = "\n".join(doc_prompts)

# Echo the pieces of the prompt before sending it; each item goes on its own line.
print(
    template_heading,
    "",
    "Question: ",
    question,
    "",
    "Materials:",
    documents_text,
    "",
    sep="\n",
)

# Prompt layout; the {placeholders} are filled in by PromptTemplate.
template = (
    "\n"
    "{template_heading}\n"
    "\n"
    "Question: \n"
    "{question}\n"
    "\n"
    "Materials:\n"
    "{documents_text}\n"
)

prompt = PromptTemplate(
    template=template,
    input_variables=["template_heading", "question", "documents_text"],
)

chain = LLMChain(llm=llm, prompt=prompt)

# Values for the three placeholders declared above.
chain_inputs = {
    'template_heading': template_heading,
    'question': question,
    'documents_text': documents_text,
}

# The header is printed first, then the chain is run and its text printed.
print("\nResponse:")
print(chain.run(chain_inputs))


Answer each of the following questions. Please provide a detailed answer on how you arrived at your answer.
Use the provided materials to help you answer the question.


Question: 


Describe your Microsoft's capital structure over the last 3 years.



Note:
You should analyze leverage ratios in this question. 
Use the market value of equity to calculate leverage ratios. 
Focus on the leverage ratio defined as total debt / (total debt + market value of equity). 

Refer to historical data on the market value of equity for your companies.



Microsoft Capital Structure (USD - Millions):
Year 2020:
- Total Debt: 82110.0
- Total Common Equity: 118,304.00

Year 2021:
- Total Debt: 82,278.00
- Total Common Equity: 141,988.00

Year 2022:
- Total Debt: 78,400.00
- Total Common Equity: 166,542.00


Apple Capital Structure (USD - Millions):
Year 2020:
- Total Debt: 122,278.00
- Total Common Equity: 65,339.00

Year 2021:
- Total Debt: 136522.00
- Total Common Equity: 63090.00

Year 2022:
- Total

In [None]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
# Alternative: answer with LangChain's QA-with-sources chain.
# chain_type="stuff" puts all input documents into one prompt, so pass only the
# retrieved search results; `documents` holds every chunk of the loaded corpus.
chain = load_qa_with_sources_chain(llm, chain_type="stuff")
chain({"input_documents": search_result_documents, "question": question}, return_only_outputs=True)

In [None]:
##### DO NOT USE #####
## Example of querying Chroma collection without Langchain.
## Search for documents in the vector database.

import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

# Note: This embedding function must match the one that was used to index the collection.
# We are indexing with Langchain though, so it must come from the Langchain library.
# NOTE(review): this cell rebinds `embedding_function`, `documents` and
# `metadatas` from the cells above; re-run those cells before reusing them.
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

chroma_client = chromadb.Client(Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory=db_path
        ))

# The loading cell writes to the collection named `documents_dirs`, so query
# that collection rather than one named after a single file.
collection = chroma_client.get_collection(name=documents_dirs, embedding_function=embedding_function)

result = collection.query(
    query_texts=[question],
    n_results=5
)

# One query text was sent, so take the first (only) result list of each kind.
documents = result['documents'][0]
metadatas = result['metadatas'][0]

print("")
print("Results:")
print(f"Doc Count: {len(documents)}")
print("")

# Source and page are read from the metadata entry paired with each document.
for index, (doc, metadata) in enumerate(zip(documents, metadatas)):
    metadata = metadata or {}  # tolerate an entry with no metadata
    print("")
    print(f"Doc: {index + 1}")
    print(f"Source: {metadata.get('source', '')}")
    print(f"Page: {metadata.get('page', '')}")
    print("")
    print(doc)