In [1]:
from dotenv import load_dotenv
import os
import time
import fitz
import tiktoken
import openai
import textwrap
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeEmbeddings, PineconeVectorStore
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.documents import Document
from langchain import hub

load_dotenv('C:/apis/.env') # path to your dotenv file
pinecone_api_key = os.getenv("PINECONE_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")
def mask_token(token, unmasked_chars=4):
    return token[:unmasked_chars] + '*' * (len(token) - unmasked_chars*2) + token[-unmasked_chars:]
print(mask_token(pinecone_api_key,4))
print(mask_token(openai_api_key,4))

  from tqdm.autonotebook import tqdm


pcsk*******************************************************************FZVA
sk-p************************************************************************************************************************************************************_5sA


### Loading PDF files

In [2]:
class PDFLoader:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path

    def extract_text(self):
        doc = fitz.open(self.pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        return text

if __name__ == '__main__':
    loader = PDFLoader('pdf/KEYS-form-10k.pdf')
    text = loader.extract_text()
    print(text)

UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
_____________________________________________________________
Form 10-K
_____________________________________________________________
(Mark One)
☒    ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
   For the fiscal year ended October 31, 2024
or
☐    TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
   For the transition period from                 to
Commission File Number: 001-36334
_____________________________________________________________
KEYSIGHT TECHNOLOGIES, INC.
(Exact name of registrant as specified in its charter)
Delaware
 
46-4254555
State or other jurisdiction of
Incorporation or organization
 
I.R.S. Employer
Identification No.
Address of principal executive offices: 1400 Fountaingrove Parkway, Santa Rosa, CA 95403
Registrant's telephone number, including area code: (800) 829-4444
Securities registered pursuant to Section 

### Splitting text into Pinecone documents

In [3]:
def chunk_text_by_tokens(text, chunk_size, encoding_name="cl100k_base"):
    """
    Splits the text into chunks based on the number of tokens.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text)
    return [encoding.decode(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]

def process_text(text, chunk_size=1000):
    """
    Splits the text it into chunks, and generates the embeddings.
    """
    chunks = chunk_text_by_tokens(text, chunk_size)
    return chunks

chunks = process_text(text, chunk_size=800)

In [4]:
# Convert chunks list to Document objects with metadata
docs = [Document(page_content=text, metadata={"chunk": i}) for i, text in enumerate(chunks)]
display(docs)

[Document(metadata={'chunk': 0}, page_content="UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n_____________________________________________________________\nForm\xa010-K\n_____________________________________________________________\n(Mark One)\n☒\xa0\xa0\xa0\xa0ANNUAL REPORT PURSUANT TO SECTION\xa013 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF\xa01934\n\xa0\xa0\xa0For the fiscal year ended October 31, 2024\nor\n☐\xa0\xa0\xa0\xa0TRANSITION REPORT PURSUANT TO SECTION\xa013 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF\xa01934\n\xa0\xa0\xa0For the transition period from\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0to\nCommission File Number: 001-36334\n_____________________________________________________________\nKEYSIGHT TECHNOLOGIES,\xa0INC.\n(Exact name of registrant as specified in its charter)\nDelaware\n\xa0\n46-4254555\nState or other jurisdiction of\nIncorporation or organization\n\xa0\nI.R.S. Employer\nIdentification No.\nAddress of

### Generating embeddings with Pinecone

In [5]:
model_name = 'multilingual-e5-large'
embeddings = PineconeEmbeddings(
    model=model_name,
    pinecone_api_key=pinecone_api_key
)

In [6]:
pc = Pinecone(api_key=pinecone_api_key)

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

index_name = "form-10k-docs"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embeddings.dimension,
        metric="cosine",
        spec=spec
    )
    # Wait for index to be ready
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

pc.Index(index_name).describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'nvda-form-10k': {'vector_count': 194}},
 'total_vector_count': 194}

### Defining the Pinecone vector store

In [7]:
namespace = "keys-form-10k"

docsearch = PineconeVectorStore.from_documents(
    documents=docs,
    index_name=index_name,
    embedding=embeddings,
    namespace=namespace
)

time.sleep(5)
pc.Index(index_name).describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'nvda-form-10k': {'vector_count': 194}},
 'total_vector_count': 194}

In [10]:
pc.Index(index_name).describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'keys-form-10k': {'vector_count': 123},
                'nvda-form-10k': {'vector_count': 194}},
 'total_vector_count': 317}

Direct search on vector space by index:

In [12]:
index = pc.Index(index_name)

for ids in index.list(namespace=namespace):
    query = index.query(
        id=ids[3],
        namespace=namespace,
        top_k=1,
        include_values=False,
        include_metadata=True
    )
    print(query)

{'matches': [{'id': '02c93724-6666-4fc4-b74f-96ad6d62befe',
              'metadata': {'chunk': 103.0,
                           'text': '12)\n'
                                   '(16)\n'
                                   'Net of income tax\n'
                                   'Total reclassifications for the period\n'
                                   '$\n'
                                   '(5)\n'
                                   '$\n'
                                   '(10)\n'
                                   'Net of income tax\n'
                                   '16.\xa0\xa0\xa0\xa0SEGMENT INFORMATION\n'
                                   'Our operating segments were determined '
                                   'based primarily on how the chief operating '
                                   'decision maker views and evaluates our '
                                   'operations. Segment\n'
                                   'operating results are regularly reviewed 

### Using the chatbot

In [13]:
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
retriever=docsearch.as_retriever()

llm = ChatOpenAI(
    openai_api_key=openai_api_key,
    model_name='gpt-4o-mini',
    temperature=0.0
)

combine_docs_chain = create_stuff_documents_chain(
    llm, retrieval_qa_chat_prompt
)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)



Question without context:

In [14]:
query1 = "Keysight net income in 2024"
answer1_without_knowledge = llm.invoke(query1)
wrapper = textwrap.TextWrapper(width=100)
print("Query:\n", wrapper.fill(query1))
print("Answer without knowledge:\n", wrapper.fill(answer1_without_knowledge.content))

Query:
 Keysight net income in 2024
Answer without knowledge:
 I'm sorry, but I don't have access to real-time financial data or projections for specific companies
like Keysight Technologies for the year 2024. For the most accurate and up-to-date financial
information, I recommend checking financial news websites, the company's official investor relations
page, or financial databases.


Question with RAG context:

In [15]:
answer1_with_knowledge = retrieval_chain.invoke({"input": query1})
print("Query 1:", wrapper.fill(query1))
print("Answer with knowledge:\n", wrapper.fill(answer1_with_knowledge['answer']))
print("\n")
print("Context used:")
print(answer1_with_knowledge['context'][0].id)
print(answer1_with_knowledge['context'][0].metadata)
print(answer1_with_knowledge['context'][0].page_content)

Query 1: Keysight net income in 2024
Answer with knowledge:
 Keysight's net income for the year ended October 31, 2024, was $614 million.


Context used:
d2f0b9cd-eb3e-410d-8761-0195d5892528
{'chunk': 40.0}
 Note 11,”Debt,” for
additional information.
Other income (expense) for 2024, 2023 and 2022 was income of $35 million, expense of $25 million, and income of $14 million, respectively, and
primarily include net income related to our defined benefit and post-retirement benefit plans (interest cost, expected return on assets, amortization of net
actuarial loss and prior service credits, and gains (losses) on settlements and curtailments), gains (losses) due to currency and derivative instruments, and the
change in fair value of our equity investments. The increase in net other income for 2024 compared to 2023 was primarily driven by gains on derivative
instruments and higher net gains on our equity investments, partially offset by an increase in pension costs due to higher interest cos

In [19]:
for item in answer1_with_knowledge['context']:
    print(item)

page_content='aries
Notes to the Consolidated Financial Statements
(Continued)
A reconciliation of gross unrecognized tax benefits is as follows:
 
Jan 28, 2024
Jan 29, 2023
Jan 30, 2022
 
(In millions)
Balance at beginning of period
$
1,238 
$
1,013 
$
776 
Increases in tax positions for current year
616 
268 
246 
Increases in tax positions for prior years
87 
1 
14 
Decreases in tax positions for prior years
(148)
(15)
(4)
Settlements
(104)
(9)
(8)
Lapse in statute of limitations
(19)
(20)
(11)
Balance at end of period
$
1,670 
$
1,238 
$
1,013 
Included in the balance of unrecognized tax benefits as of January 28, 2024 are $1.0 billion of tax benefits that would affect our effective tax rate if recognized.
We classify an unrecognized tax benefit as a current liability, or amount refundable, to the extent that we anticipate payment or receipt of cash for income taxes
within one year. The amount is classified as a long-term liability, or reduction of long-term amount refundable, if w

________________________________________________________________

_________________