# 1. Data Ingestion

In [2]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import sentence_transformers
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
load_dotenv()

True

In [5]:

file_path = os.path.join(os.getcwd(), "../data/experimental/CoverLetter.pdf")
loader = PyPDFLoader(file_path)
documents = loader.load()


In [6]:
documents[0].page_content

"Dear Connor, Clark & Lunn Investment Management Hiring Team, \nI am writing to express my strong interest in the Quantitative Developer position within your quantitative equity \nfund. As a final-year student pursuing a double degree in Computer Science from the University of Waterloo \nand Business Administration from Wilfrid Laurier University, I am excited by the opportunity to apply technology \nto drive investment decisions in a fund managing over $38 billion in assets. \nTechnical Excellence with Analytical Depth \nMy computer science background provides programming knowledge and analytical mindset essential for \nquantitative development. Through coursework in Machine Learning, Neural Networks, and Optimization, \ncombined with experience in Python, R, and SQL, I have developed the foundation to work with your \ntechnology stack. My passion for learning new technologies was demonstrated when I mastered React.js \nduring a live PwC project, delivering under tight deadlines. \nAt

In [7]:
documents[0]

Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-09-26T20:56:18-04:00', 'author': 'Shiv C Saraswat', 'moddate': '2025-09-26T20:56:18-04:00', 'source': 'c:\\Users\\shivc\\OneDrive\\Desktop\\DocumentPortalAgent\\notebooks\\../data/experimental/CoverLetter.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content="Dear Connor, Clark & Lunn Investment Management Hiring Team, \nI am writing to express my strong interest in the Quantitative Developer position within your quantitative equity \nfund. As a final-year student pursuing a double degree in Computer Science from the University of Waterloo \nand Business Administration from Wilfrid Laurier University, I am excited by the opportunity to apply technology \nto drive investment decisions in a fund managing over $38 billion in assets. \nTechnical Excellence with Analytical Depth \nMy computer science background provides programming knowledge

In [8]:
textsplitter = RecursiveCharacterTextSplitter(
    chunk_size = 500, 
    chunk_overlap = 150, 
    length_function = len
)

In [9]:
documents = textsplitter.split_documents(documents)

In [10]:
documents[8].metadata

{'producer': 'Microsoft® Word for Microsoft 365',
 'creator': 'Microsoft® Word for Microsoft 365',
 'creationdate': '2025-09-26T20:56:18-04:00',
 'author': 'Shiv C Saraswat',
 'moddate': '2025-09-26T20:56:18-04:00',
 'source': 'c:\\Users\\shivc\\OneDrive\\Desktop\\DocumentPortalAgent\\notebooks\\../data/experimental/CoverLetter.pdf',
 'total_pages': 1,
 'page': 0,
 'page_label': '1'}

# 2. Data Retreival 

In [11]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

In [12]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"  # Fast and efficient
)

In [13]:
vectorStore = FAISS.from_documents(documents, embeddings)

In [14]:
vectorStore.similarity_search('What is the capital of India')

[Document(id='f5c975a6-cddc-454c-93ee-b70b676da83d', metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-09-26T20:56:18-04:00', 'author': 'Shiv C Saraswat', 'moddate': '2025-09-26T20:56:18-04:00', 'source': 'c:\\Users\\shivc\\OneDrive\\Desktop\\DocumentPortalAgent\\notebooks\\../data/experimental/CoverLetter.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='to drive investment decisions in a fund managing over $38 billion in assets. \nTechnical Excellence with Analytical Depth \nMy computer science background provides programming knowledge and analytical mindset essential for \nquantitative development. Through coursework in Machine Learning, Neural Networks, and Optimization, \ncombined with experience in Python, R, and SQL, I have developed the foundation to work with your'),
 Document(id='78ad60b9-7021-4feb-9d16-1bdd27c6d68d', metadata={'producer': 'Microsoft® Word for Microsoft 365', 'cr

In [15]:
retriever  = vectorStore.as_retriever(search_kwargs = {'k' : 10})

In [16]:
retriever.invoke('What is thy name')

[Document(id='be2c0910-8354-48eb-bd5a-bc997f5bf674', metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-09-26T20:56:18-04:00', 'author': 'Shiv C Saraswat', 'moddate': '2025-09-26T20:56:18-04:00', 'source': 'c:\\Users\\shivc\\OneDrive\\Desktop\\DocumentPortalAgent\\notebooks\\../data/experimental/CoverLetter.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='combined with experience in Python, R, and SQL, I have developed the foundation to work with your \ntechnology stack. My passion for learning new technologies was demonstrated when I mastered React.js \nduring a live PwC project, delivering under tight deadlines. \nAt Dare Foods, I developed a Decision Tree Regression model on complex datasets to optimize shelf \nplacement strategies, resulting in an expected 5% sales lift. I also created a novel data integration approach'),
 Document(id='46f7a31e-f96e-4ace-967f-1b5b9ce60422', metadata={'

# 3. Data Generation

In [17]:
from langchain.prompts import PromptTemplate

In [18]:
prompt_template = '''
May I request you to answer the following question based on the provided context and question.
context : {context}
question : {question}
May I additionally request you to provide an appropriate consolation message in case if you are unable to answer the question.
'''


In [19]:
prompt = PromptTemplate(template = prompt_template, input = ['context', 'question'])

## LLM

In [46]:
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
llm = ChatGroq(model = 'deepseek-r1-distill-llama-70b')

In [53]:
def format_docs(documents): 
    '/n/n'.join(docs.page_content for docs in documents)

In [54]:
rag_chain = {'context' : retriever  | format_docs, 'question' : RunnablePassthrough()} | prompt | llm | StrOutputParser()

In [55]:
rag_chain.invoke('What is the name of the company?')

'<think>\nOkay, so I need to figure out the name of the company based on the context provided. But wait, the context is "None." Hmm, that\'s a bit confusing. Let me try to break this down.\n\nFirst, the user is asking for the company\'s name, but there\'s no context given. Without any information, it\'s impossible to determine which company they\'re referring to. Maybe the context was supposed to be provided but wasn\'t, or perhaps it\'s a mistake. \n\nI should consider that the user might have intended to include some details but forgot. Alternatively, they might be testing how I handle situations with missing information. Either way, I can\'t proceed without more data.\n\nI remember that sometimes, when context is missing, it\'s best to politely inform the user that the information isn\'t available and ask for more details. That way, I can assist them better once they provide the necessary context.\n\nSo, I\'ll let them know that without the context, I can\'t find the company\'s name