In [1]:
from dotenv import load_dotenv
import os
import re, json
import requests
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders.dataframe import DataFrameLoader
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
# from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQAWithSourcesChain
from genai.model import Credentials
from genai.schemas import GenerateParams
from genai.extensions.langchain import LangChainInterface
import time
from pypdf import PdfReader
import pandas as pd
import math

import logging
# Surface INFO-level messages from the libraries used below (model loading,
# FAISS, HTTP requests).
logging.basicConfig(level=logging.INFO)

# Load the BAM_KEY / BAM_API settings from the current user's ~/.env instead of
# an absolute path tied to one machine. As the last expression of the cell,
# the boolean returned by load_dotenv is displayed.
load_dotenv(os.path.expanduser('~/.env'))



True

In [2]:
# %%time
# hf_embeddings = HuggingFaceEmbeddings()

In [3]:
# %%time
# pdf_reader = PdfReader('data/IBM_Annual_Report_2022.pdf')
# text = ""
# for page in pdf_reader.pages:
#     text += page.extract_text()

In [4]:
# %%time
# text_splitter = CharacterTextSplitter(
# separator="\n",
# chunk_size=1024,
# chunk_overlap=0.1 * 1024,
# length_function=len
# )

In [5]:
# %%time
# docs = text_splitter.split_text(text)

In [6]:
# %%time
# vectorstore = FAISS.from_texts(docs, embedding=hf_embeddings)

In [7]:
%%time
# Embedding model created once at cell level; get_vector_index reads this
# module-level name when it builds the FAISS index.
hf_embeddings = HuggingFaceEmbeddings()
def get_vector_index(file, chunk_size=1024, chunk_overlap=None, embeddings=None):
    """Build a FAISS vector index over the text of a PDF.

    Args:
        file: Path (or file-like object) handed to ``PdfReader``.
        chunk_size: Maximum chunk length, measured with ``len``. Defaults to
            1024, the value previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks. When ``None`` it is
            1% of ``chunk_size`` truncated to an integer (10 for the default
            size; the old code passed the float 10.24).
        embeddings: Embedding model passed to ``FAISS.from_texts``. When
            ``None`` the module-level ``hf_embeddings`` is used, as before.

    Returns:
        The FAISS vector store built from the PDF's text chunks.

    Raises:
        ValueError: if no text chunks could be produced from the PDF.
    """
    if chunk_overlap is None:
        chunk_overlap = int(0.01 * chunk_size)
    if embeddings is None:
        # Fall back to the notebook-level embedding model.
        embeddings = hf_embeddings

    pdf_reader = PdfReader(file)
    # Pages are concatenated without a separator, exactly as before; the join
    # avoids repeated string re-allocation, and `or ""` guards against a page
    # that yields no text.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Split into newline-delimited chunks.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    docs = text_splitter.split_text(text)
    if not docs:
        # Fail with a clear message instead of an opaque error from the
        # vector store when there is nothing to embed.
        raise ValueError(f"No extractable text found in {file!r}; cannot build a vector index.")

    return FAISS.from_texts(docs, embedding=embeddings)

# Index the 10-K PDF once; later cells query this module-level `vectorstore`.
vectorstore = get_vector_index('data/_10-K-2022-(As-Filed).pdf')

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /var/folders/zf/vsn0hgrx6bbb2ff0qbcd3n4m0000gn/T/tmpgovp66fk
INFO:torch.distributed.nn.jit.instantiator:Writing /var/folders/zf/vsn0hgrx6bbb2ff0qbcd3n4m0000gn/T/tmpgovp66fk/_remote_module_non_scriptable.py
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:faiss.loader:Loading faiss.
INFO:faiss.loader:Successfully loaded faiss.


CPU times: user 1min 31s, sys: 26.9 s, total: 1min 58s
Wall time: 35.7 s


In [11]:
# Topic to report on, written as "<field name>: <description>".
topic = 'major_events: Highlight of significant events, acquisitions, or strategic shifts that occurred over the past year.'

# Retrieve the three chunks most similar to the topic.
relevant_docs = vectorstore.similarity_search(topic, k=3)

# Assemble the context block, walking the hits in the reverse of the order
# returned by the search.
sections = []
for doc in reversed(relevant_docs):
    sections.append(f'Document:\n{doc.page_content}\n\n')
relevant_content = ''.join(sections)

print(relevant_content)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Document:
Consolidated Statements of Comprehensive Income for the years ended September 24, 2022, September 25, 
2021 and September 26, 2020 30
Consolidated Balance Sheets as of September 24, 2022 and September 25, 2021 31
Consolidated Statements of Shareholders’ Equity for the years ended September 24, 2022, September 25, 2021 
and September 26, 2020 32
Consolidated Statements of Cash Flows for the years ended September 24, 2022, September 25, 2021 and 
September 26, 2020 33
Notes to Consolidated Financial Statements 34
Reports of Independent Registered Public Accounting Firm* 50
* Ernst & Young LLP, PCAOB Firm ID No. 000 42.
(2)Financial Statement Schedules
All financial statement schedules have been omitted, since the required information is not applicable or is not present in amounts 
sufficient to require submission of the schedule, or because the information required is included in the consolidated financial 
statements and accompanying notes included in this Form 10-K.

Document

In [12]:
# Credentials come from the environment (populated by load_dotenv earlier).
API_KEY = os.getenv("BAM_KEY")
API_ENDPOINT = os.getenv("BAM_API")
if not API_KEY or not API_ENDPOINT:
    # Fail with an actionable message rather than an AttributeError on None.
    raise RuntimeError("BAM_KEY and BAM_API must be set in the environment (e.g. in the .env file).")

# Drop 'generate' from the configured URL before handing it to Credentials.
API_ENDPOINT = API_ENDPOINT.replace('generate', '')
creds = Credentials(API_KEY, API_ENDPOINT)

# Greedy decoding; the sampling knobs (temperature, top_k, top_p) are
# intentionally left unset here.
params = GenerateParams(
    decoding_method="greedy",
    min_new_tokens=1,
    max_new_tokens=1000,
    repetition_penalty=1.2,
)

# LangChain-compatible client used by the cells below.
llm = LangChainInterface(
    model='meta-llama/llama-2-70b-chat',
    credentials=creds,
    params=params,
)

In [13]:
# Prompt template: the <<SYS>> block sets the analyst role and the bullet-point
# output format; {docs} and {topic} are placeholders filled in with str.format.
prompt = '''<<SYS>>
You are a financial analyst who reads 10k files and writes reports on topics.
Base your answers on the Relevant Information Section only and do not make up any information.
Return answer using Bullet points like the following:

Report: 
•
•
•
<</SYS>>

[INST]
Relevant Information: 
{docs}

Topic: {topic}[/INST]
Report:'''

In [14]:
# Fill the template with the topic and the whitespace-stripped retrieval
# context, then echo the final prompt for inspection.
to_send = prompt.format(topic=topic, docs=relevant_content.strip())

print(to_send)

<<SYS>>
You are a financial analyst who reads 10k files and writes reports on topics.
Base your answers on the Relevant Information Section only and do not make up any information.
Return answer using Bullet points like the following:

Report: 
•
•
•
<</SYS>>

[INST]
Relevant Information: 
Document:
Consolidated Statements of Comprehensive Income for the years ended September 24, 2022, September 25, 
2021 and September 26, 2020 30
Consolidated Balance Sheets as of September 24, 2022 and September 25, 2021 31
Consolidated Statements of Shareholders’ Equity for the years ended September 24, 2022, September 25, 2021 
and September 26, 2020 32
Consolidated Statements of Cash Flows for the years ended September 24, 2022, September 25, 2021 and 
September 26, 2020 33
Notes to Consolidated Financial Statements 34
Reports of Independent Registered Public Accounting Firm* 50
* Ernst & Young LLP, PCAOB Firm ID No. 000 42.
(2)Financial Statement Schedules
All financial statement schedules have be

In [15]:
# Send the assembled prompt to the model and print the generated report.
# The name `resonse` [sic] is kept because a later cell reads it.
resonse = llm(to_send)
print(resonse)

INFO:httpx:HTTP Request: GET https://bam-api.res.ibm.com/v1/generate/limits "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://bam-api.res.ibm.com/v1/generate "HTTP/1.1 200 OK"
INFO:genai.extensions.langchain.llm:Output of GENAI call: 

• The company experienced a challenging year in 2022, facing various obstacles including the COVID-19 pandemic, global economic uncertainty, and increased competition in the tech industry.
• Despite these challenges, Apple's net sales reached a record high of $400 billion, driven by strong demand for its products and services.
• The company continued to invest heavily in research and development, with expenses increasing by 15% year-over-year to $14.2 billion.
• Apple's focus on innovation and customer satisfaction has allowed it to maintain its position as a leader in the technology sector, despite intensifying competition.
• The company has implemented various measures to mitigate the impact of the pandemic on its operations, including remote wor



• The company experienced a challenging year in 2022, facing various obstacles including the COVID-19 pandemic, global economic uncertainty, and increased competition in the tech industry.
• Despite these challenges, Apple's net sales reached a record high of $400 billion, driven by strong demand for its products and services.
• The company continued to invest heavily in research and development, with expenses increasing by 15% year-over-year to $14.2 billion.
• Apple's focus on innovation and customer satisfaction has allowed it to maintain its position as a leader in the technology sector, despite intensifying competition.
• The company has implemented various measures to mitigate the impact of the pandemic on its operations, including remote work arrangements and increased investments in digital infrastructure.
• Looking ahead, Apple remains well-positioned for long-term growth, thanks to its strong brand, loyal customer base, and commitment to innovation.


In [13]:
# Bare expression: displays the repr of the stored model output (newlines escaped).
resonse

"\n• Launch of new products and services, such as the iPhone 14 series, AirPods Pro 2, and Apple Watch Series 8\n• Expansion of Apple's original content offerings through Apple TV+, including new shows and partnerships with prominent studios and talent\n• Continued growth of Apple's service segments, including the App Store, Apple Music, Apple Pay, and iCloud\n• Strategic shift towards more sustainable and environmentally friendly practices, including the use of recycled materials in product manufacturing and renewable energy sources for powering facilities\n• Acquisition of several companies, including Intel's smartphone modem business and Dark Sky, a weather app developer\n• Collaboration with Amazon to offer Apple TV+ on Amazon's Fire TV streaming devices\n• Entry into new markets, such as India, where Apple launched its first official online store"

In [None]:
def _default_llm():
    """Create the sampling llama-2-70b-chat client used when the caller supplies none.

    Raises:
        RuntimeError: if BAM_KEY or BAM_API is not set after loading the .env file.
    """
    # Per-user location instead of an absolute path tied to one machine.
    load_dotenv(os.path.expanduser('~/.env'))
    api_key = os.getenv("BAM_KEY")
    api_endpoint = os.getenv("BAM_API")
    if not api_key or not api_endpoint:
        raise RuntimeError("BAM_KEY and BAM_API must be set in the environment (e.g. in the .env file).")

    # Same normalisation the notebook-level client setup applies: drop
    # 'generate' from the configured URL before building the credentials.
    api_endpoint = api_endpoint.replace('generate', '')
    creds = Credentials(api_key, api_endpoint)

    params = GenerateParams(
        decoding_method="sample",
        min_new_tokens=1,
        max_new_tokens=500,
        temperature=0.7,
        top_k=50,
        top_p=1,
        repetition_penalty=1.2,
    )

    return LangChainInterface(
        model='meta-llama/llama-2-70b-chat',
        credentials=creds,
        params=params,
    )


def answer_question(question, db, llm=None):
    """Answer a question from the chunks in ``db`` most similar to it.

    Args:
        question: Question text; used both as the similarity query and in the prompt.
        db: Object exposing ``similarity_search(question)`` that returns items
            with a ``page_content`` string attribute.
        llm: Optional callable taking the prompt string and returning the
            model's text. When ``None`` a default client is built from the
            BAM_KEY / BAM_API environment variables.

    Returns:
        Whatever ``llm`` returns for the assembled prompt.

    Raises:
        RuntimeError: if ``llm`` is None and the BAM credentials are missing.
    """
    if llm is None:
        llm = _default_llm()

    relevant_docs = db.similarity_search(question)
    # Hits are added in the reverse of the order returned by the search.
    relevant_content = ''.join(
        '```\n' + 'Content: ' + doc.page_content + '\n```\n'
        for doc in reversed(relevant_docs)
    )

    prompt = '''<<SYS>>
You are a financial analyst who reads 10k files and reports back. 
Return answer using Bullet points like the following:

Answer: 
•
•
<</SYS>>

[INST]
Relevant Information: 
{docs}

Question: {question}[/INST]
Answer:'''

    to_send = prompt.format(docs=relevant_content.strip(), question=question)

    return llm(to_send)

In [None]:
# Report sections. Each dict maps a field name to the description that is sent
# along with it as the question; each *_attributes list holds that section's
# field names in declaration order.

fiscal_year = dict(
    performance_highlights="Key performance and financial stats over the fiscal year.",
    major_events="Highlight of significant events, acquisitions, or strategic shifts that occurred during the year.",
    challenges_encountered="Challenges the company faced during the year and, if and how they managed or overcame them.",
)
fiscal_year_attributes = list(fiscal_year)

strat_outlook = dict(
    strategic_initiatives="The company's primary objectives and growth strategies for the upcoming years.",
    market_outlook="Insights into the broader market, competitive landscape, and industry trends the company anticipates.",
    product_roadmap="Upcoming launches, expansions, or innovations the company plans to roll out.",
)
strat_outlook_attributes = list(strat_outlook)

risk_management = dict(
    risk_factors="Primary risks the company acknowledges.",
    risk_mitigation="Strategies for managing these risks.",
)
risk_management_attributes = list(risk_management)

innovation = dict(
    r_and_d_activities="Overview of the company's focus on research and development, major achievements, or breakthroughs.",
    innovation_focus="Mention of new technologies, patents, or areas of research the company is diving into.",
)
innovation_attributes = list(innovation)


def report_insights(fields_to_include, section_num, db=None):
    """Generate LLM insights for the selected fields of one report section.

    Args:
        fields_to_include: Sequence of truthy/falsy flags, one per field of the
            section, in the order of that section's ``*_attributes`` list.
            Extra trailing flags are ignored.
        section_num: 1 = fiscal year, 2 = strategic outlook,
            3 = risk management, 4 = innovation.
        db: Optional vector store passed to ``answer_question``. When ``None``
            the module-level ``vectorstore`` is used, as before.

    Returns:
        ``{"insights": {field_name: response}}`` containing only the fields
        whose flag was truthy.

    Raises:
        ValueError: if ``section_num`` is not 1-4, or if ``fields_to_include``
            has fewer flags than the section has fields.
    """
    if section_num == 1:
        fields, attribs = fiscal_year, fiscal_year_attributes
    elif section_num == 2:
        fields, attribs = strat_outlook, strat_outlook_attributes
    elif section_num == 3:
        fields, attribs = risk_management, risk_management_attributes
    elif section_num == 4:
        fields, attribs = innovation, innovation_attributes
    else:
        # Previously this fell through to enumerate(None) and a TypeError.
        raise ValueError(f"Unknown section_num {section_num!r}; expected 1, 2, 3 or 4.")

    # Validate up front so a short flag list cannot fail part-way through,
    # after LLM calls have already been made.
    if len(fields_to_include) < len(attribs):
        raise ValueError(
            f"fields_to_include has {len(fields_to_include)} flag(s) but section "
            f"{section_num} has {len(attribs)} field(s)."
        )

    ins = {}
    for include, field in zip(fields_to_include, attribs):
        if not include:
            continue
        question = field + ': ' + fields[field]
        print(question)
        # Resolve the default store lazily so it is only needed when a field
        # is actually requested.
        ins[field] = answer_question(question=question, db=vectorstore if db is None else db)

    return {
        "insights": ins
    }

In [None]:
# Section 4 is the innovation section; the flags request r_and_d_activities only.
insights = report_insights([True, False], 4)

In [None]:
# Pull the generated R&D text out of the result and print it.
to_print = insights['insights']['r_and_d_activities']

print(to_print)

In [None]:
# Bare expression: displays the repr of the same text.
to_print