In [2]:
import os
PINECONE_API_KEY = "0aa2686e-8e56-4f53-8aff-598bbfaa3570"

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [3]:
import fitz

file_path = "microsoft-annual-report.pdf"
doc = fitz.open(file_path)

In [4]:
from llama_index.core.node_parser import SentenceSplitter

text_parser = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=50
)

text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, page in enumerate(doc):
    page_text = page.get_text("text")
    cur_text_chunks = text_parser.split_text(page_text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

In [5]:
from llama_index.core.schema import TextNode

nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc_idx = doc_idxs[idx]
    src_page = doc[src_doc_idx]
    nodes.append(node)

print(nodes[0].metadata)
# print a sample node
print(nodes[0].get_content(metadata_mode="all"))

{}
Dear shareholders, colleagues, customers, and partners,  
We are living through a time of historic challenge and opportunity. As I write this, the world faces ongoing economic, 
social, and geopolitical volatility. At the same time, we have entered a new age of AI that will fundamentally transform 
productivity for every individual, organization, and industry on earth, and help us address some of our most pressing 
challenges.  
This next generation of AI will reshape every software category and every business, including our own. Forty-eight years 
after its founding, Microsoft remains a consequential company because time and time again—from PC/Server, to Web/
Internet, to Cloud/Mobile—we have adapted to technological paradigm shifts. Today, we are doing so once again, as we 
lead this new era.  
Amid this transformation, our mission to empower every person and every organization on the planet to achieve 
more remains constant. As a company, we believe we can be the democratizing fo

In [6]:
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama


model_name = "llama3.1"
embed_model = HuggingFaceEmbedding()

for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

In [16]:

from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec


pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "financial-annual-report-rag-1"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=384,
        metric="euclidean",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

pinecone_index = pc.Index(index_name)

vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [17]:
vector_store.add(nodes)

Upserted vectors:   0%|          | 0/187 [00:00<?, ?it/s]

['df9d684b-5c0a-4210-a851-93e768b66e11',
 '1df47e31-99ff-4560-8ea0-97fc27904005',
 '353e399d-e7f8-4330-8a37-11bfbbeeedea',
 '9f5b3a60-f9d3-4f3c-9f27-25fab5e37dba',
 '287d7400-fce1-4e7d-947f-f1063191e180',
 '67bc53e6-0469-4a9f-9081-b13496c7e3bd',
 'fa609c65-9fc2-43b7-9f6f-dbffed09e66c',
 '66a034b9-2d21-459b-8d79-4e5288385950',
 '0ab7dd02-b043-4880-94ce-545495b6e48d',
 'a2f9f427-1a32-49d2-b4ce-de9c6e81dbcd',
 'c0771cec-58dd-411c-8d1b-09f4809ea23f',
 '092c8b67-b0ec-469b-814e-41ae265024d0',
 '4041685a-c03f-4f87-a2ce-9aca6e030a89',
 'fea6cc56-9e48-4daa-b229-ee5362dc30cc',
 '67230541-2a01-4361-911d-4745c433ea68',
 '0756ebda-ce34-45e1-91fb-79f831ecb27d',
 'd4ea6693-c930-4c23-a111-8c63fa962e2a',
 'ca34b679-6202-41fe-9a7f-3e4bc0981ede',
 'e8e89e33-be23-497b-a264-b863371a26dd',
 '81122b9e-da72-4624-8396-6371d2a80a25',
 '8152d29a-2066-478b-8183-47c35d49bd8f',
 'b1c03b24-17df-476b-819c-b7af8ee7deb1',
 'f4c95fa1-69ec-4228-b919-21cbc4a75788',
 '703cf568-69a4-4046-85e9-7cc4679f8cbf',
 'b05fff3c-be09-

In [92]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

query_str = "How many shares of common stock did we repurchase under the share repurchase programs?"
query_embedding = embed_model.get_query_embedding(query_str)

In [93]:
from llama_index.core.vector_stores import VectorStoreQuery

query_mode = "default"


vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding, similarity_top_k=10, mode=query_mode
)

# returns a VectorStoreQueryResult
query_result = vector_store.query(vector_store_query)

In [94]:
from llama_index.core.schema import NodeWithScore
from typing import Optional

nodes_with_scores = []
for index, node in enumerate(query_result.nodes):
    score: Optional[float] = None
    if query_result.similarities is not None:
        score = query_result.similarities[index]
    nodes_with_scores.append(NodeWithScore(node=node, score=score))

In [95]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List


class PineconeRetriever(BaseRetriever):
    """Retriever over a pinecone vector store."""

    def __init__(
        self,
        vector_store: PineconeVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Init params."""
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve."""
        if query_bundle.embedding is None:
            query_embedding = self._embed_model.get_query_embedding(
                query_bundle.query_str
            )
        else:
            query_embedding = query_bundle.embedding

        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = self._vector_store.query(vector_store_query)

        nodes_with_scores = []
        for index, node in enumerate(query_result.nodes):
            score: Optional[float] = None
            if query_result.similarities is not None:
                score = query_result.similarities[index]
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [96]:
retriever = PineconeRetriever(
    vector_store, embed_model, query_mode="default", similarity_top_k=10
)

In [97]:
from llama_index.core.response.notebook_utils import display_source_node

retrieved_nodes = retriever.retrieve(query_str)
for node in retrieved_nodes:
    display_source_node(node, source_length=1000)

**Node ID:** 0756ebda-ce34-45e1-91fb-79f831ecb27d<br>**Similarity:** 0.16111505<br>**Text:** ISSUER PURCHASES OF EQUITY SECURITIES, DIVIDENDS, AND STOCK PERFORMANCE  
MARKET AND STOCKHOLDERS  
Our common stock is traded on the NASDAQ Stock Market under the symbol MSFT. On July 24, 2023, there were 83,883 
registered holders of record of our common stock.  
SHARE REPURCHASES AND DIVIDENDS  
Share Repurchases  
On September 18, 2019, our Board of Directors approved a share repurchase program authorizing up to $40.0 billion in 
share repurchases. This share repurchase program commenced in February 2020 and was completed in November 2021.  
On September 14, 2021, our Board of Directors approved a share repurchase program authorizing up to $60.0 billion in 
share repurchases. This share repurchase program commenced in November 2021, following completion of the program 
approved on September  18, 2019, has no expiration date, and may be terminated at any time. As of June  30, 2023, 
$22.3 billion remained of this $60.0 billion share repurchase program.  
We repurchased the follow...<br>

**Node ID:** 8d87dc29-086a-40e6-92c0-2c79b5f1c100<br>**Similarity:** 0.16808784<br>**Text:** On September 14, 2021, our Board of Directors approved a share repurchase program authorizing up to $60.0 billion in 
share repurchases. This share repurchase program commenced in November 2021, following completion of the program 
approved on September  18, 2019, has no expiration date, and may be terminated at any time. As of June  30, 2023, 
$22.3 billion remained of this $60.0 billion share repurchase program.  
We repurchased the following shares of common stock under the share repurchase programs:  
  
  
All repurchases were made using cash resources. Shares repurchased during fiscal year 2023 and the fourth and third 
quarters of fiscal year 2022 were under the share repurchase program approved on September  14, 2021. Shares 
repurchased during the second quarter of fiscal year 2022 were under the share repurchase programs approved on both 
September 14, 2021 and September 18, 2019. All other shares repurchased were under the share repurchase program 
approved on September  ...<br>

**Node ID:** af26751c-4b3c-49fe-bee9-103183073b3e<br>**Similarity:** 0.219495893<br>**Text:** STOCKHOLDERS’ EQUITY STATEMENTS  
  
Refer to accompanying notes.  
  
(In millions, except per share amounts)
 
 
 
 
 
 
 
 
Year Ended June 30,
2023
2022
2021
 
 
 
 
Common stock and paid-in capital
 
 
 
Balance, beginning of period
$	
86,939	
$	
83,111	
$	
  80,552	
Common stock issued
	
1,866	
	
1,841	
	
1,963	
Common stock repurchased
	
(4,696	)
	
(5,688	)
	
(5,539	)
Stock-based compensation expense
	
9,611	
	
7,502	
	
6,118	
Other, net
	
(2	)
	
173	
	
17	
 
 
 
Balance, end of period
	
93,718	
	
86,939	
	
83,111	
 
 
 
Retained earnings
 
 
 
Balance, beginning of period
	
84,281	
	
57,055	
	
34,566	
Net income
	
72,361	
	
72,738	
	
61,271	
Common stock cash dividends
	
(20,226	)
	
(18,552	)
	
(16,871	)
Common stock repurchased
	
(17,568	)
	
(26,960	)
	
(21,879	)
Cumulative effect of accounting changes
	
0	
	
0	
	
(32	)
 
 
 
Balance, end of period
	
118,848	
	
84,281	
	
57,055	
 
 
 
Accumulated other comprehensive income (loss)
 
 
 
Balance, beginning of period
	
(4,678	...<br>

**Node ID:** 42a81ec7-57b8-4950-9c70-b1f0e2115d70<br>**Similarity:** 0.247462988<br>**Text:** 245	)
	
(32,696	)
	
(27,385	)
Common stock cash dividends paid
	
(19,800	)
	
(18,135	)
	
(16,521	)
Other, net
	
(1,006	)
	
(863	)
	
(769	)
 
 
 
Net cash used in financing
	
(43,935	)
	
(58,876	)
	
(48,486	)
 
 
 
Investing
 
 
 
Additions to property and equipment
	
(28,107	)
	
(23,886	)
	
(20,622	)
Acquisition of companies, net of cash acquired, and purchases of 
intangible and other assets
	
(1,670	)
	
(22,038	)
	
(8,909	)
Purchases of investments
	
(37,651	)
	
(26,456	)
	
(62,924	)
54<br>

**Node ID:** 9b3ed716-36e5-4f7c-8632-2c8cd7fe29ed<br>**Similarity:** 0.247816443<br>**Text:** Irish Data Protection Commission Matter  
In 2018, the Irish Data Protection Commission (“IDPC”) began investigating a complaint against LinkedIn as to whether 
LinkedIn’s targeted advertising practices violated the recently implemented European Union General Data Protection 
Regulation (“GDPR”). Microsoft cooperated throughout the period of inquiry. In April 2023, the IDPC provided LinkedIn 
with a non-public preliminary draft decision alleging GDPR violations and proposing a fine. Microsoft intends to challenge 
the preliminary draft decision. There is no set timeline for the IDPC to issue a final decision.  
Other Contingencies  
We also are subject to a variety of other claims and suits that arise from time to time in the ordinary course of our 
business. Although management currently believes that resolving claims against us, individually or in aggregate, will not 
have a material adverse impact in our consolidated financial statements, these matters are subject to inherent 
un...<br>

**Node ID:** 765ef8a4-5f6e-4b25-bafd-3d7f543cb827<br>**Similarity:** 0.258326292<br>**Text:** Income Taxes  
As a result of the TCJA, we are required to pay a one-time transition tax on deferred foreign income not previously subject 
to U.S. income tax. Under the TCJA, the transition tax is payable in interest-free installments over eight years, with 8% 
due in each of the first five years, 15% in year six, 20% in year seven, and 25% in year eight. We have paid transition tax 
of $7.7 billion, which included $1.5 billion for fiscal year 2023. The remaining transition tax of $10.5 billion is payable over 
the next three years, with $2.7 billion payable within 12 months.  
In fiscal year 2023, we paid cash tax of $4.8 billion due to the mandatory capitalization for tax purposes of research and 
development expenditures enacted by the TCJA and effective on July 1, 2022.  
Share Repurchases  
During fiscal years 2023 and 2022, we repurchased 69 million shares and 95 million shares of our common stock for 
$18.4 billion and $28.0 billion, respectively, through our share repurchas...<br>

**Node ID:** d4ea6693-c930-4c23-a111-8c63fa962e2a<br>**Similarity:** 0.26702559<br>**Text:** (In millions)
Shares
Amount
Shares
Amount
Shares
Amount
 
 
 
 
 
Year Ended June 30,
	
2023	
	
2022	
	
2021	
 
 
 
 
 
 
 
First Quarter
	
17	  $	
4,600	  	
21	  $	
6,200	 	
25	  $	
5,270	 
Second Quarter
	
20	 	
4,600	 	
20	 	
6,233	 	
27	 	
5,750	
Third Quarter
	
18	 	
4,600	 	
26	 	
7,800	 	
25	 	
5,750	
Fourth Quarter
	
14	 	
4,600	 	
28	 	
7,800	 	
24	 	
6,200	
 
 
 
 
 
 
Total
	
 69	 $	
 18,400	 	
 95	 $	  28,033	 	
 101	 $	
 22,970	
 
 
 
 
 
 
 
8<br>

**Node ID:** 3bf0559b-306a-49ac-ad44-f78f4afff15b<br>**Similarity:** 0.268746376<br>**Text:** (In millions)
Shares
Amount
Shares
Amount
Shares
Amount
 
 
 
 
 
Year Ended June 30,
2023
2022
2021
 
 
 
 
 
 
 
First Quarter
	
17	 $	
4,600	 	
21	 $	
6,200	 	
25	 $	
5,270	
Second Quarter
	
20	 	
4,600	 	
20	 	
6,233	 	
27	 	
5,750	
Third Quarter
	
18	 	
4,600	 	
26	 	
7,800	 	
25	 	
5,750	
Fourth Quarter
	
14	 	
4,600	 	
28	 	
7,800	 	
24	 	
6,200	
 
 
 
 
 
 
Total
	
 69	 $	
 18,400	 	
 95	 $	  28,033	 	
 101	 $	  22,970	
 
 
 
 
 
 
 
98<br>

**Node ID:** 8fa6c118-e6f9-47ed-9f2e-382076920d0a<br>**Similarity:** 0.271399856<br>**Text:** NOTE 18 — EMPLOYEE STOCK AND SAVINGS PLANS  
We grant stock-based compensation to employees and directors. Awards that expire or are canceled without delivery of 
shares generally become available for issuance under the plans. We issue new shares of Microsoft common stock to 
satisfy vesting of awards granted under our stock plans. We also have an ESPP for all eligible employees.  
Stock-based compensation expense and related income tax benefits were as follows:  
  
  
Stock Plans  
Stock awards entitle the holder to receive shares of Microsoft common stock as the award vests. Stock awards generally 
vest over a service period of four years or five years.  
Executive Incentive Plan  
Under the Executive Incentive Plan, the Compensation Committee approves stock awards to executive officers and 
certain senior executives. RSUs generally vest ratably over a service period of four years. PSUs generally vest over a 
performance period of three years. The number of shares the PSU holder ...<br>

**Node ID:** e8e89e33-be23-497b-a264-b863371a26dd<br>**Similarity:** 0.275581598<br>**Text:** STOCK PERFORMANCE  
COMPARISON OF 5 YEAR CUMULATIVE TOTAL RETURN*  
Among Microsoft Corporation, the S&P 500 Index  
and the NASDAQ Computer Index  
  
  
 
  
*	 $100 invested on 6/30/18 in stock or index, including reinvestment of dividends. Fiscal year ending June 30.  
  
 
6/18
6/19
6/20
6/21
6/22
6/23
 
Microsoft Corporation
$	  100.00	 $	  138.07	 $	  212.34	 $	  285.40	 $	  272.82	 $	
 365.24	
S&P 500
	
100.00	 	
110.42	 	
118.70	 	
167.13	 	
149.39	 	
178.66	
NASDAQ Computer
	
100.00	 	
106.10	 	
156.93	 	
236.08	 	
184.53	 	
242.82	
10<br>

Response Synthesis

1. Try a Simple Prompt¶

In [98]:
from llama_index.llms.ollama import Ollama
from llama_index.core import PromptTemplate

llm = Ollama(model="llama3.1")

In [99]:
qa_prompt = PromptTemplate(
    """\
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query with it starting from the question. 

Query: {query_str}
Answer: \
"""
)

In [100]:
def generate_response(retrieved_nodes, query_str, qa_prompt, llm):
    context_str = "\n\n".join([r.get_content() for r in retrieved_nodes])
    fmt_qa_prompt = qa_prompt.format(
        context_str=context_str, query_str=query_str
    )
    response = llm.complete(fmt_qa_prompt)
    return str(response), fmt_qa_prompt

In [101]:
response, fmt_qa_prompt = generate_response(
    retrieved_nodes, query_str, qa_prompt, llm
)

In [102]:
print(f"*****Response******:\n{response}\n\n")

*****Response******:
| Year Ended June 30 | Shares Repurchased |
| --- | --- |
| 2023 | 69 million |
| 2022 | 95 million |
| 2021 | 101 million |

Note: The table only shows the shares repurchased under the share repurchase programs, and does not include shares repurchased to settle employee tax withholding related to the vesting of stock awards.




In [73]:
print(f"*****Formatted Prompt*****:\n{fmt_qa_prompt}\n\n")

*****Formatted Prompt*****:
Context information is below.
---------------------
In October 2016, the Court 
of Appeals issued its decision adopting the standard advocated by the defendants and remanding the cases to the trial 
court for further proceedings under that standard. The plaintiffs have filed supplemental expert evidence, portions of which 
were stricken by the court. A hearing on general causation took place in September of 2022. In April of 2023, the court 
granted defendants’ motion to strike the testimony of plaintiffs’ experts that cell phones cause brain cancer and entered an 
order excluding all of plaintiffs’ experts from testifying.  
(In millions)
 
 
 
 
 
 
Year Ending June 30,
Operating 
Leases
Finance 
Leases
 
 
 
2024
$	
2,784	
$	
1,747	
2025
	
2,508	
	
2,087	
2026
	
2,142	
	
1,771	
2027
	
1,757	
	
1,780	
2028
	
1,582	
	
1,787	
Thereafter
	
6,327	
	
11,462	
 
 
Total lease payments
	
17,100	
	
20,634	
Less imputed interest
	
(1,963	)
	
(3,567	)
 
 
Total
$	
  