In [3]:
print('Data')

Data


In [5]:
from pypdf import PdfReader

#Reading the PDF file
reader = PdfReader("data\microsoft-annual-report.pdf")
pdf_text = [p.extract_text().strip() for p in reader.pages]

#filtering out empty strings
pdf_text = [text for text in pdf_text if text]

  reader = PdfReader("data\microsoft-annual-report.pdf")


In [9]:
#splitting the text into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

character_splitter  = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)

character_splitter_texts = character_splitter.split_text("\n\n".join(pdf_text))

In [18]:
len(character_splitter_texts)

383

In [19]:
token_splitter = SentenceTransformersTokenTextSplitter(
    chunk_size=256,
    chunk_overlap=0
)

token_split_texts = []
for text in character_splitter_texts:
    token_split_texts += token_splitter.split_text(text)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [20]:
len(token_split_texts)

384

In [36]:
#Initializing chromadb
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

embedding_function = SentenceTransformerEmbeddingFunction()


In [38]:
# we then instantiate the Chroma client and create a collection called "microsoft-collection"
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection(
    "microsoft-collection", embedding_function=embedding_function
)

InternalError: Collection [microsoft-collection] already exists

In [39]:
# extract the embeddings of the token_split_texts
ids = [str(i) for i in range(len(token_split_texts))]

chroma_collection.add(ids=ids, documents=token_split_texts)
chroma_collection.count()


KeyboardInterrupt: 

In [42]:
#Example passing a query to the collection and checking the retrieval process
query = "What was the total revenue for the year?"

results = chroma_collection.query(query_texts=[query], n_results=5)
retrieved_documents = results["documents"][0]
retrieved_documents 


['segment revenue and operating income were as follows during the periods presented : no sales to an individual customer or country other than the united states accounted for more than 10 % of revenue for fiscal years 2023, 2022, or 2021. revenue, classified by the major geographic areas in which our customers were located, was as follows : ( a ) includes billings to oems and certain multinational organizations because of the nature of these businesses and the impracticability of determining the geographic source of the revenue. ( in millions ) year ended june 30, 2023 2022 2021 revenue productivity and business processes $ 69, 274 $ 63, 364 $ 53, 915 intelligent cloud 87, 907 74, 965 59, 728 more personal computing 54, 734 59, 941 54, 445 total $ 211, 915 $ 198, 270 $ 168, 088 operating income productivity and business processes $ 34, 189 $ 29, 690 $ 24, 351 intelligent cloud 37, 884 33, 203 26, 471',
 '( in millions, except percentages ) 2023 2022 percentage change sales and marketin

In [43]:
#Initating the LLM and passing the query to it to get the llm response as an hallucinated response
from dotenv import load_dotenv
import os

load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

#initializing the llm
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash",
                            google_api_key=GOOGLE_API_KEY , 
                            temperature=0.7)


In [None]:
response = llm.generate(
    query=query,
    documents=retrieved_documents,
    max_output_tokens=256
)

In [94]:
def generate_multi_query(query, llm = llm):
    system_prompt = """
    You are a knowledgeable financial research assistant. 
    Your users are inquiring about an annual report. 
    Provide an example answer to the given question, that might be found in a document like an annual report..Since this promtp is using forquery expansion function in rag system,
    """

    # Combine the system prompt + user query
    final_prompt = f"{system_prompt}\n\nUser question: {query}"

    # Use LLM to generate output
    response = llm.invoke(final_prompt)

    # 'response' is usually just a string (if using LangChain wrappers)
    content = response.content
    return content



In [95]:
original_query = "What details can you provide about the factors that led to revenue growth?"

aug_queries = generate_multi_query(original_query,llm)

print(aug_queries)


Our revenue growth in fiscal year 2023, a 15% increase to $1.2 billion compared to $1.04 billion in fiscal year 2022, can be attributed to several key factors:

* **Strong Demand for Product X:**  Increased market demand for our flagship product, Product X, contributed significantly to revenue growth.  This was driven by a successful marketing campaign targeting a new demographic segment (Millennials), coupled with positive customer reviews and increased brand awareness.  Sales of Product X increased by 22% year-over-year.

* **Successful Launch of Product Y:** The launch of Product Y in Q3 2023 exceeded expectations, contributing $100 million in revenue during its first quarter.  This new product line successfully tapped into an underserved market segment and complemented our existing offerings.

* **Expansion into New Geographic Markets:** Our strategic expansion into the European market, initiated in Q2 2023, yielded positive results, contributing an additional 5% to overall revenue

In [96]:

# 2. concatenate the original query with the augmented queries
joint_query =  original_query + aug_queries  # original query is in a list because chroma can actually handle multiple queries, so we add it in a list

# print("======> \n\n", joint_query)
joint_query

'What details can you provide about the factors that led to revenue growth?Our revenue growth in fiscal year 2023, a 15% increase to $1.2 billion compared to $1.04 billion in fiscal year 2022, can be attributed to several key factors:\n\n* **Strong Demand for Product X:**  Increased market demand for our flagship product, Product X, contributed significantly to revenue growth.  This was driven by a successful marketing campaign targeting a new demographic segment (Millennials), coupled with positive customer reviews and increased brand awareness.  Sales of Product X increased by 22% year-over-year.\n\n* **Successful Launch of Product Y:** The launch of Product Y in Q3 2023 exceeded expectations, contributing $100 million in revenue during its first quarter.  This new product line successfully tapped into an underserved market segment and complemented our existing offerings.\n\n* **Expansion into New Geographic Markets:** Our strategic expansion into the European market, initiated in Q2

In [97]:

results = chroma_collection.query(
    query_texts=joint_query, n_results=5, include=["documents", "embeddings"]
)
retrieved_documents = results["documents"]

flat_documents = [item for sublist in retrieved_documents for item in sublist]

# Join the list into a single string
retrieved_text = " ".join(flat_documents)

In [98]:
def generate_response(retrieved_text):
    retrieved_text = "\n\n".join(retrieved_text)
    prompt2 = (
        "YYou are a knowledgeable financial research assistant for question-answering tasks. Use the following pieces of "
        "retrieved context to answer the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the answer concise."
        "\n\nretrieved Context:\n" + retrieved_text + "\n\nQuestion:\n" + original_query
    )

    response = llm.invoke(prompt2)  # LangChain's way
    return response.content

In [99]:
final_response = generate_response(retrieved_text)

In [101]:
print(retrieved_text)

trade shows, seminars, and other programs. fiscal year 2023 compared with fiscal year 2022 sales and marketing expenses increased $ 934 million or 4 % driven by 3 points of growth from the nuance and xandr acquisitions and investments in commercial sales, offset in part by a decline in windows advertising. sales and marketing included a favorable foreign currency impact of 2 %. general and administrative general and administrative expenses include payroll, employee benefits, stock - based compensation expense, employee severance expense incurred as part of a corporate program, and other headcount - related expenses associated with ( in millions, except percentages ) 2023 2022 percentage change research and development $ 27, 195 $ 24, 512 11 % as a percent of revenue 13 % 12 % 1ppt ( in millions, except percentages ) 2023 2022 percentage change sales and marketing $ 22, 759 $ 21, 825 4 % as a percent of revenue 11 % 11 % 0ppt segment results of operations reportable segments fiscal year