# 1. Data Ingestion 

## Loading pdfs 

## Converting them into documents

In [2]:
from langchain_community.document_loaders import FileSystemBlobLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import PyPDFParser

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
loader = GenericLoader(
    blob_loader=FileSystemBlobLoader(
        path = '../data/Experimental',
        glob = '*.pdf'
    ), 
    blob_parser = PyPDFParser()
)



In [4]:
documents = loader.load()

In [5]:
documents[1]

Document(metadata={'producer': 'pdfTeX-1.40.24', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-02-04T06:05:08-05:00', 'author': '', 'keywords': '', 'moddate': '2025-10-02T05:03:47-04:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Arch Linux) kpathsea version 6.3.4', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\Experimental\\P1 Spec (1).pdf', 'total_pages': 27, 'page': 0, 'page_label': '1'}, page_content='CS 454/654 Project 1 and 2\nSee LEARN for deadlines and TA office hours.\nRead this document carefully and note the important sections. You will need to refer back \nto this document frequently when implementing P1 and P2.\nVery Important: You will need to dedicate sufficient time for P1 and P2—to design, \nimplement, and most importantly debug your code! A last weekend dash is unlikely \nto allow sufficient time to finish the complete Project, especially P2.\nContents\n1 Introduction 3\n2 WatDFS Overview 4\n3 

In [6]:
# Chunking 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

In [7]:
textsplitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000, 
    chunk_overlap = 150, 
    length_function = len
)

In [8]:
split_documents = textsplitter.split_documents(documents)

## Embedding model

In [9]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"  # Fast and efficient
)

## Ingestion into Vector DB 

In [10]:
from langchain.vectorstores import FAISS

In [11]:
vectorStore = FAISS.from_documents(documents, embeddings)

In [28]:
documents

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-09-26T20:56:18-04:00', 'author': 'Shiv C Saraswat', 'moddate': '2025-09-26T20:56:18-04:00', 'source': '..\\data\\Experimental\\CoverLetter.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content="Dear Connor, Clark & Lunn Investment Management Hiring Team, \nI am writing to express my strong interest in the Quantitative Developer position within your quantitative equity \nfund. As a final-year student pursuing a double degree in Computer Science from the University of Waterloo \nand Business Administration from Wilfrid Laurier University, I am excited by the opportunity to apply technology \nto drive investment decisions in a fund managing over $38 billion in assets. \nTechnical Excellence with Analytical Depth \nMy computer science background provides programming knowledge and analytical mindset essential for \nquantitative development.

# 2. Data Retrieval

In [12]:
retriever = vectorStore.as_retriever(search_kwargs = {'K' : 100})

# 3. Data Generation

In [13]:
prompt_template = '''
May I request you to answer the question based on following context. May I further request you to respond with professional response in case the question might not be answered at the moment
question : {question}
context : {context}
'''

In [14]:
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [15]:
prompt = PromptTemplate(template = prompt_template, input_variables= ['question', 'context'] )

In [22]:
llm = ChatGroq(model = 'llama-3.3-70b-versatile')

In [23]:
def format_docs(documents):
    '\n\n'.join(doc.page_content for doc in documents)

In [24]:
rag_chain = {'question' : RunnablePassthrough(), 'context' : retriever | format_docs} | prompt | llm | StrOutputParser()

In [29]:
rag_chain.invoke('When is Project 1 due?')

"I appreciate your request for a professional response. Unfortunately, I don't have any information regarding Project 1, including its due date, as there is no context or prior knowledge provided. If you could provide more details or clarify the project's scope, I'll be more than happy to assist you. Alternatively, I can suggest possible steps to find the due date, such as checking the project's documentation, contacting the project manager, or reviewing relevant communication channels. Please let me know how I can further assist you."