In [1]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
text_docs = PyPDFLoader("../Data/Senior_Associate_Data_Engineering_L1_DS-AI__.pdf").load()
text_docs

[Document(metadata={'producer': 'PDFKit.NET 12.3.320.0 DMV10', 'creator': 'PyPDF', 'creationdate': '2025-06-03T03:42:51-07:00', 'moddate': '2025-06-03T03:46:38-07:00', 'source': '../Data/Senior_Associate_Data_Engineering_L1_DS-AI__.pdf', 'total_pages': 18, 'page': 0, 'page_label': '1'}, page_content='3 June, 2025\nShivang Singh                                        \nC-7, GM Colony, Ballarpur Area\nPost - Sasti, Tehsil - Rajura\nChandrapur\nMaharashtra\n442905\n                                          \nDear Shivang,     \n \nWe are delighted to offer you a position as Senior Associate Data Science L1 with Publicis \nSapient, a division of TLG India Pvt. Ltd (“Publicis Sapient”).\n \nPublicis Sapient will provide you with a total compensation package that consists of your \nbase salary and statutory retirement benefits.\n \nThe Total Cost to Company offered to you is INR 2,000,001.00/-  which includes the \nfollowing:-\no An annualized base salary of INR 1,888,860.00/- (including \nH

In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splitted_texts = text_splitter.split_documents(text_docs)
splitted_texts

[Document(metadata={'producer': 'PDFKit.NET 12.3.320.0 DMV10', 'creator': 'PyPDF', 'creationdate': '2025-06-03T03:42:51-07:00', 'moddate': '2025-06-03T03:46:38-07:00', 'source': '../Data/Senior_Associate_Data_Engineering_L1_DS-AI__.pdf', 'total_pages': 18, 'page': 0, 'page_label': '1'}, page_content='3 June, 2025\nShivang Singh                                        \nC-7, GM Colony, Ballarpur Area\nPost - Sasti, Tehsil - Rajura\nChandrapur\nMaharashtra\n442905\n                                          \nDear Shivang,     \n \nWe are delighted to offer you a position as Senior Associate Data Science L1 with Publicis \nSapient, a division of TLG India Pvt. Ltd (“Publicis Sapient”).\n \nPublicis Sapient will provide you with a total compensation package that consists of your \nbase salary and statutory retirement benefits.\n \nThe Total Cost to Company offered to you is INR 2,000,001.00/-  which includes the \nfollowing:-\no An annualized base salary of INR 1,888,860.00/- (including \nH

In [5]:
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
vector_db = Chroma.from_documents(splitted_texts, embedding_model, persist_directory="chroma_index")
vector_db

<langchain_chroma.vectorstores.Chroma at 0x133941540>

In [6]:
#As our vector db is created we can now query it
query = "What are the responsibilities of a Senior Associate in Data Engineering?"
results = vector_db.similarity_search(query, k=3)
for result in results:
    print(result.page_content)
    print("\n---\n")

Employee 
Name Shivang Singh  Level/Designation Senior Associate 
Data Science L1
Salary Structure
S.No. Components Components Details Monthly Annual
A  Fixed
1 Basic Salary 35% of Base Salary INR 55,092.00/- INR 661,101.00 /-
B  Flexible
2 House Rent 
Allowance
Can be fixed at 10%  or 
21% of Base Salary INR 33,055.00/- INR 396,661.00 /-
3 Leave Travel 
Assistance
Default amount will  be 
set as zero or can  be 
claimed once a  year as 
per  entitlement 
per  career stage
0 0
   Special Allowance
4 Special 
Allowance
Base Salary - ( 
Fixed  elements + 
Flexible  Element + 
Optional  Benefits)
INR 69,258.00/- INR 831,098.00/-
  Optional Benefits
5 Meal Coupons
Tax rebate 
on  (optional)  subscripti
on to meal  benefit 
which can be  availed at 
INR 2200  per month. 
Default  amount will be 
set as zero.
0 0
Docusign Envelope ID: 7BE7AC8E-8D41-4224-976D-4AE0B0CB4614

---

Employee 
Name Shivang Singh  Level/Designation Senior Associate 
Data Science L1
Salary Structure
S.No. Components 

In [7]:
#load the vector db from the disk
vector_db = Chroma(persist_directory="chroma_index", embedding_function=embedding_model)
# Now we can perform a similarity search again
results = vector_db.similarity_search(query, k=3)
for result in results:
    print(result.page_content)
    print("\n---\n")

Employee 
Name Shivang Singh  Level/Designation Senior Associate 
Data Science L1
Salary Structure
S.No. Components Components Details Monthly Annual
A  Fixed
1 Basic Salary 35% of Base Salary INR 55,092.00/- INR 661,101.00 /-
B  Flexible
2 House Rent 
Allowance
Can be fixed at 10%  or 
21% of Base Salary INR 33,055.00/- INR 396,661.00 /-
3 Leave Travel 
Assistance
Default amount will  be 
set as zero or can  be 
claimed once a  year as 
per  entitlement 
per  career stage
0 0
   Special Allowance
4 Special 
Allowance
Base Salary - ( 
Fixed  elements + 
Flexible  Element + 
Optional  Benefits)
INR 69,258.00/- INR 831,098.00/-
  Optional Benefits
5 Meal Coupons
Tax rebate 
on  (optional)  subscripti
on to meal  benefit 
which can be  availed at 
INR 2200  per month. 
Default  amount will be 
set as zero.
0 0
Docusign Envelope ID: 7BE7AC8E-8D41-4224-976D-4AE0B0CB4614

---

Employee 
Name Shivang Singh  Level/Designation Senior Associate 
Data Science L1
Salary Structure
S.No. Components 

In [8]:
##Lets use this vectordb as retriever also
retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
retriever.invoke(query)[0].page_content

'Employee \nName Shivang Singh  Level/Designation Senior Associate \nData Science L1\nSalary Structure\nS.No. Components Components Details Monthly Annual\nA  Fixed\n1 Basic Salary 35% of Base Salary INR 55,092.00/- INR 661,101.00 /-\nB  Flexible\n2 House Rent \nAllowance\nCan be fixed at 10%  or \n21% of Base Salary INR 33,055.00/- INR 396,661.00 /-\n3 Leave Travel \nAssistance\nDefault amount will  be \nset as zero or can  be \nclaimed once a  year as \nper  entitlement \nper  career stage\n0 0\n   Special Allowance\n4 Special \nAllowance\nBase Salary - ( \nFixed  elements + \nFlexible  Element + \nOptional  Benefits)\nINR 69,258.00/- INR 831,098.00/-\n  Optional Benefits\n5 Meal Coupons\nTax rebate \non  (optional)  subscripti\non to meal  benefit \nwhich can be  availed at \nINR 2200  per month. \nDefault  amount will be \nset as zero.\n0 0\nDocusign Envelope ID: 7BE7AC8E-8D41-4224-976D-4AE0B0CB4614'