In [2]:
import os
from getpass import getpass
# Ask for the OpenAI API key only when it is not already set in the
# environment, so a "Restart & Run All" does not stop to wait for input when
# the key was exported before the kernel started. getpass keeps the typed key
# out of the notebook source and out of the rendered cell output.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API Key:')

In [8]:
## Init main variables

# Root directory that holds the source documents and the persisted vector
# database. The default is the original development checkout; set the
# RESAI_NOTEBOOKS_ROOT environment variable to run the notebook from another
# machine or location without editing this cell. An empty value falls back to
# the default as well.
root_path = os.environ.get("RESAI_NOTEBOOKS_ROOT") or "/Users/ryanlindbeck/Development/resai/app/backend/notebooks"
doc_path = f"{root_path}/documents"   # PDFs to index live under here
db_path = f"{root_path}/db_vector"    # Chroma persist directory

# File name of a single PDF (referenced by the single-file loader variants).
document_name = "mod_5_transcript.pdf"
# Sub-directory of doc_path that is loaded as a whole; also used as the
# Chroma collection name when indexing.
documents_dirs = "fin570"

In [9]:
## Sample Quiz One

## Question 1 (GPT4 - correct, GPT3 - incorrect)
# question = """
# Consider a company with the following income statement (in millions):

# Income Statement:
# Operating Income: 4,259.0
# Net Interest: (850.0)
# EBT: 3,409.0
# Income Tax Expense: 681.8 
# Net Income: 2,727.2

# Consider the impact of the issuance of 12B in new debt at a 4 percent interest rate. 

# Net income after the debt issuance will be equal to what?
# """

# possible_answers = """
# A: 2,343.2
# B: 3,023.2
# C: 2,456.8
# D: 1,950.5
# """


## Question 2 (both correct)
# question = """
# Consider a company with the following income statement (in millions):

# Income Statement:
# Operating Income: 4,259.0
# Net Interest: (850.0)
# EBT: 3,409.0
# Income Tax Expense: 681.8
# Net Income: 2,727.2

# Which of the following statements is correct?
# """

# possible_answers = """
# A: If the company had issued 12B of equity rather than debt, net income would have been lower because dividends are higher than interest payments.
# B: The NPV of the 12B debt issuance is likely to be negative because it reduces the company’s net income.
# C: If markets are efficient, the NPVs of both debt and equity issuance should be close to zero.
# D: If the interest rate increases to 5%, the company should no longer issue debt because debt became more expensive.
# """

## Question 3 (GPT4 - correct, GPT3 - incorrect)
# question = """
# A company has a cost of capital (WACC) equal to 7.8%. The cost of equity is 9%, the cost of 
# debt is 4%, the tax rate is 30%, and debt represents 20% of the value of the firm (D/V = 20%). 

# Suppose the company increases D/V to 30 percent by issuing debt and repurchasing stock. 

# Which of the following options is correct? 
# """

# possible_answers = """
# A. The Beta of the company's equity is likely to increase following the debt issuance. 
# B. WACC will go up because the company is paying more interest, and thus, the cost of capital increases. 
# C. The company's cost of equity will decrease because the firm has more debt and less equity. 
# D. The new WACC is 7.1%.
# """

# Question 4 (GPT4 - correct, GPT3 - incorrect)
# question = """
# A company has a cost of capital (WACC) equal to 7.8%. The cost of equity is 9%, the cost of 
# debt is 4%, the tax rate is 30%, and debt represents 20% of the value of the firm (D/V = 20%). 

# Suppose the company increases D/V to 30%, by issuing debt and repurchasing stock. 
# Suppose also that the M&M (Modigliani and Miller) result is correct. 
# If the cost of debt is still 4%, then the cost of equity will be what?

# Which of the following options is correct? 
# """

# possible_answers = """
# A. We do not know what the new cost of equity will be.
# B. Lower than 9% since the debt issuance benefits the company.
# C. Still 9% since by M&M the cost of capital does not depend on leverage. 
# D. Greater than 9%.
# """


# Question 5 (GPT3 - correct, GPT4 - incorrect)
# The active question and its answer options. Both strings are assembled line
# by line so that the leading/trailing newlines and the trailing spaces that
# are part of the prompt text stay visible.
question = "\n".join([
    "",
    "Consider a company with the following income statement (in millions):",
    "",
    "Operating Income: 4,259.0",
    "Net Interest: (850.0)",
    "EBT: 3,409.0",
    "Income Tax Expense: 681.8 ",
    "Net Income: 2,727.2",
    "",
    "What is the current OPAT (operating profits after taxes)?",
    "",
    "Which of the following options is correct? ",
    "",
])

possible_answers = "\n".join([
    "",
    "A. 3,846.3 ",
    "B. 3,409.0",
    "C. 3,577.2",
    "D. 2,727.2",
    "",
])

In [22]:
# Load PDF document into Chroma collection.

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TextSplitter, CharacterTextSplitter, SentenceTransformersTokenTextSplitter

# Sentence Transformer embedding function; the same object is handed to the
# Chroma store below, so it is used for indexing and for later queries.
# https://www.sbert.net/
# https://docs.trychroma.com/embeddings
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Load every PDF found in the selected documents sub-directory.
loader = PyPDFDirectoryLoader(f"{doc_path}/{documents_dirs}")
# loader = PyMuPDFLoader(f"{doc_path}/{document_name}")
# loader = PyPDFLoader(f"{doc_path}/{document_name}")

# Split the loaded pages into chunks with the splitter's default settings and
# keep text and metadata as parallel lists for add_texts.
text_splitter = SentenceTransformersTokenTextSplitter()
documents = loader.load_and_split(text_splitter=text_splitter)
texts = [d.page_content for d in documents]
metadatas = [d.metadata for d in documents]

# store = Chroma(persist_directory=db_path, collection_name=document_name, embedding_function=embedding_function)
store = Chroma(persist_directory=db_path, collection_name=documents_dirs, embedding_function=embedding_function)

# NOTE(review): no ids are passed, so presumably every run of this cell
# appends the chunks again to the collection in db_path. Confirm, and clear
# the collection (or skip this cell) before re-indexing.
store.add_texts(texts, metadatas=metadatas)

# A persist_directory is configured, so write the collection to disk now
# instead of relying on the store object being destroyed: in a notebook the
# kernel keeps it alive. Guarded because newer releases persist automatically
# and may not expose persist().
if hasattr(store, "persist"):
    store.persist()

print(f"Done loading {len(texts)} documents into Chroma collection.")

Done loading 954 documents into Chroma collection.


In [20]:
# Retrieve the 10 chunks most similar to the question from the vector store.
search_result_documents = store.similarity_search(question, k=10)

print(f"Total Results: {len(search_result_documents)}")

# Sort search results by page number. A chunk without 'page' metadata is
# treated as page 0 instead of raising KeyError, which matches the guarded
# metadata lookups in the loop below.
sorted_results = sorted(search_result_documents, key=lambda doc: int(doc.metadata.get('page', 0)))

# One formatted "material" block per retrieved chunk; joined into the LLM
# prompt by a later cell.
doc_prompts: list[str] = []

# Page numbers of the retrieved chunks, in sorted order (printed for
# inspection only). The loader output shows these as integers.
page_numbers: list[int] = []

for doc in sorted_results:
    page_content = doc.page_content
    # Missing metadata is rendered as an empty string in the prompt.
    source = doc.metadata.get("source", '')
    page = doc.metadata.get("page", '')

    if 'page' in doc.metadata:
        page_numbers.append(page)

    doc_prompts.append(
f"""
Material Content:
{page_content}

Material Metadata:
Source: {source}
Page: {page}
"""
    )


print(f"Page Numbers: {page_numbers}")
print("Doc Prompts")
print(doc_prompts)

Total Results: 10
Page Numbers: [16, 59, 60, 61, 63, 64, 68, 75, 79, 81]
Doc Prompts
["\nMaterial Content:\ncorporate finance ii : financing investments and managing risk professors heitor almeida and stefan zeume 17 well, if you look at the income statement for a very long time, you will realize that nothing really changes. there's no interest expense. you don't pay interest on stocks, so the income statement is going to be the same. your interest is not changing at all. that means that your earnings are going to be the same. there is no change in earnings. let's move to the cash flow statement. here, instead of borrowing, we now have seven\n\nMaterial Metadata:\nSource: /Users/ryanlindbeck/Development/resai/app/backend/notebooks/documents/mod_5_transcript.pdf\nPage: 16\n", "\nMaterial Content:\ncorporate finance ii : financing investments and managing risk professors heitor almeida and stefan zeume 60 so, here's a trick question for you, and i want you to work on this very carefully.

In [21]:
from langchain import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI

# Model used to answer the question (temperature 0.0).
llm = OpenAI(temperature=0.0)
# llm = ChatOpenAI(temperature=0, model="gpt-4")

# Instruction placed at the top of the prompt.
template_heading = """
Answer the following question by selecting from one of the below possible answers.
Use the provided materials to help you answer the question.
"""

# All retrieved material blocks, one after another.
documents_text = "\n".join(doc_prompts)

# Echo the pieces of the prompt so they can be inspected next to the response.
prompt_preview = (
    template_heading,
    "",
    "Question: ",
    question,
    "",
    "Possible Answers:",
    possible_answers,
    "",
    "Materials:",
    documents_text,
    "",
)
print(*prompt_preview, sep="\n")

# Prompt layout; every placeholder is filled from prompt_inputs below.
template = """
{template_heading}

Question: 
{question}

Possible Answers:
{possible_answers}

Materials:
{documents_text}
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["template_heading", "question", "possible_answers", "documents_text"],
)

chain = LLMChain(prompt=prompt, llm=llm)

prompt_inputs = dict(
    template_heading=template_heading,
    question=question,
    possible_answers=possible_answers,
    documents_text=documents_text,
)

# The header is printed before the model is called, then the model's answer.
print("", "Response:", sep="\n")
print(chain.run(prompt_inputs))


Answer the following question by selecting from one of the below possible answers.
Use the provided materials to help you answer the question.


Question: 

Consider a company with the following income statement (in millions):

Operating Income: 4,259.0
Net Interest: (850.0)
EBT: 3,409.0
Income Tax Expense: 681.8 
Net Income: 2,727.2

What is the current OPAT (operating profits after taxes)?

Which of the following options is correct? 


Possible Answers:

A. 3,846.3 
B. 3,409.0
C. 3,577.2
D. 2,727.2


Materials:

Material Content:
corporate finance ii : financing investments and managing risk professors heitor almeida and stefan zeume 17 well, if you look at the income statement for a very long time, you will realize that nothing really changes. there's no interest expense. you don't pay interest on stocks, so the income statement is going to be the same. your interest is not changing at all. that means that your earnings are going to be the same. there is no change in earnings. let'

In [None]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

# Alternative to the hand-built prompt: LangChain's question-answering chain
# that also reports sources. The "stuff" chain type places every input
# document into a single prompt, so it is given only the chunks retrieved for
# the question (search_result_documents) rather than `documents`, which holds
# every chunk that was indexed.
chain = load_qa_with_sources_chain(llm, chain_type="stuff")
chain({"input_documents": search_result_documents, "question": question}, return_only_outputs=True)

In [None]:
##### DO NOT USE #####
## Example of querying the Chroma collection without Langchain.
## Searches for documents directly in the vector database.

import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

# Note: This embedding function must match the one that was used to index the
# collection. Indexing is done through Langchain, so the matching function
# really has to come from the Langchain library; this one only uses the same
# model name.
# Kept under its own name so the Langchain `embedding_function` used by the
# indexing cell is not overwritten.
raw_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

chroma_client = chromadb.Client(Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory=db_path
        ))

# The indexing cell stores the chunks in the collection named after
# `documents_dirs`, so that is the collection to open here.
collection = chroma_client.get_collection(name=documents_dirs, embedding_function=raw_embedding_function)

result = collection.query(
    query_texts=[question],
    n_results=5
)

# The query was issued with a single text, so take the first (only) result
# list. Separate names keep the Langchain `documents` / `metadatas` lists from
# the indexing cell intact.
raw_documents = result['documents'][0]
raw_metadatas = result['metadatas'][0]

print("")
print("Results:")
print(f"Doc Count: {len(raw_documents)}")
print("")

# The page number of each returned chunk is read from the metadata entry that
# is returned alongside it; entries without metadata are shown as "n/a".
for index, (doc, metadata) in enumerate(zip(raw_documents, raw_metadatas)):
    print("")
    print(f"Doc: {index + 1}")
    print(f"Page: {(metadata or {}).get('page', 'n/a')}")
    print("")
    print(doc)