This notebook demonstrates how to build a RAG (retrieval-augmented generation) pipeline. The following steps are included:
1. Loading a .pdf file
2. Using an embedding model and an LLM from OpenAI
3. Retrieving relevant content from a FAISS vector database
4. Augmenting and generating the response

In [None]:
!pip install langchain_openai
!pip install langchain_community
!pip install langchain_core
#!pip install lan



In [None]:
#!pip install --upgrade langchain



In [None]:
# Force-reinstall langchain to resolve persistent ModuleNotFoundError for langchain.memory
#!pip install --upgrade --force-reinstall langchain

# import necessary packages
import os
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
#from langchain.memory import ConversationBufferMemory
#from langchain_core.memory import BaseMemory
#from langchain_community.memory import ConversationBufferMemory
from langchain_classic.memory import ConversationBufferMemory

In [None]:
# Read the OpenAI API key.
# In Google Colab the key comes from the notebook's secret store; anywhere else
# (where google.colab cannot be imported) it falls back to the OPENAI_API_KEY
# environment variable.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    openai_api_key = userdata.get("OPENAI_API_KEY")
else:
    openai_api_key = os.environ.get("OPENAI_API_KEY")

# Fail here with a clear message instead of later inside an OpenAI client call.
if not openai_api_key or not openai_api_key.strip():
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it as a Colab secret or export it as "
        "an environment variable before running this notebook."
    )

# Strip stray whitespace/newlines that are easy to paste in along with the key.
openai_api_key = openai_api_key.strip()

In [None]:
!pip install pypdf
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.7


In [None]:
# Parse the policy PDF with the PyMuPDF-based loader; the loaded Document
# objects are kept in `documents` for the cells that follow.
file_path = "company_policy.pdf"
loader = PyMuPDFLoader(file_path=file_path)
documents = loader.load()

In [None]:
# Report how many Document objects the loader produced
# (the sample PDF has 24 pages, so 24 is expected).
# An f-string avoids the doubled space that a trailing space plus print()'s
# default separator produced.
print(f"Number of documents loaded: {len(documents)}")

Number of documents loaded:  24


In [None]:
# Inspect the first loaded Document (its metadata together with its page text)
documents[0]

Document(metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® Word 2016', 'creationdate': '2020-08-26T06:56:00+00:00', 'author': 'hr', 'moddate': '2020-08-26T06:56:00+00:00', 'source': 'company_policy.pdf', 'total_pages': 24, 'page': 0, 'page_label': '1'}, page_content='SPIL Corporate HR Policies  \n \n \nSIRCA PAINTS INDIA LTD \nNEW DELHI  \n \n \n \n \nCORPORATE  \n  HUMAN RESOURCES \nPOLICIES & MANUALS')

In [None]:
# Show only the text content of the first loaded Document
documents[0].page_content

'SPIL Corporate HR Policies  \n \n \nSIRCA PAINTS INDIA LTD \nNEW DELHI  \n \n \n \n \nCORPORATE  \n  HUMAN RESOURCES \nPOLICIES & MANUALS'

In [None]:
# Divide the loaded documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,     # target size of each chunk
    chunk_overlap=100,  # overlap between consecutive chunks
)
document_chunks = text_splitter.split_documents(documents)
print(f"Number of chunks: {len(document_chunks)}")

# Preview the third, fifth and seventh chunks. List indices are zero-based,
# so those chunks live at positions 2, 4 and 6 (not 3, 5 and 7).
for label, index in (("Third", 2), ("Fifth", 4), ("Seventh", 6)):
    # Skip positions that do not exist when a short document yields few chunks.
    if index < len(document_chunks):
        print(f"\n**************{label} chunk:**************\n")
        print(document_chunks[index])

Number of chunks:  122

**************Third chunk:**************

page_content='Resources Department of SPIL.  
 
Applicability  
 
This EHB will be applicable to the employees working in Sirca Paints India Limited (SPIL) w.e.f 
August 21, 2020. This book contains all the notices/circulars/extracts/meetings circulated earlier 
before the date of validity of this handbook.  
 
Definitions  
 
a) “Company” means Sirca Paints India Limited (SPIL) and will its branches, offices/plants located 
anywhere in India or Abroad.' metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® Word 2016', 'creationdate': '2020-08-26T06:56:00+00:00', 'source': 'company_policy.pdf', 'file_path': 'company_policy.pdf', 'total_pages': 24, 'format': 'PDF 1.5', 'title': '', 'author': 'hr', 'subject': '', 'keywords': '', 'moddate': '2020-08-26T06:56:00+00:00', 'trapped': '', 'modDate': 'D:20200826065600Z', 'creationDate': "D:20200826065600+00'00'", 'page': 1}

**************Fifth chunk:**************

pa

In [None]:
# Create the embedding client used to vectorize the chunks.
# The model is named explicitly (it is the value the library default resolved
# to in the recorded output) so that the FAISS index saved to disk stays tied
# to a known embedding model.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=openai_api_key,
)

# print the model used for embeddings
print(embeddings.model)

text-embedding-ada-002


In [None]:
!pip install faiss-cpu



In [None]:
# Embed every chunk and build a FAISS vector store from the resulting vectors.
vector_store = FAISS.from_documents(
    documents=document_chunks,
    embedding=embeddings,
)

# Persist the index to a local folder.
# NOTE(review): "fais_index" looks like a typo for "faiss_index"; it is left
# unchanged in case other code loads the index by this name.
vector_store.save_local(folder_path="fais_index")

In [None]:
# Conversation memory handed to the retrieval chain so follow-up questions can
# refer back to earlier turns.
memory = ConversationBufferMemory(
    output_key="answer",        # presumably so only the answer is stored when the chain also returns source documents
    return_messages=True,
    memory_key="chat_history",
)

In [None]:
# Initialize the GPT-4 Turbo chat model via LangChain.
# The key is already stripped where it is read, so it is passed as-is here,
# the same way the embeddings client receives it.
llm = ChatOpenAI(model="gpt-4-turbo", api_key=openai_api_key)

In [None]:
!pip install langchain



In [None]:
from langchain_classic.chains import RetrievalQA
from langchain_classic.chains import ConversationalRetrievalChain

In [None]:
# Expose the vector store as a retriever that is asked for 3 results per query.
retriever = vector_store.as_retriever(search_kwargs=dict(k=3))

# Wire the LLM, the retriever and the conversation memory into a
# ConversationalRetrievalChain; the source documents are returned with each answer.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever,
    return_source_documents=True,
    memory=memory,
)

In [None]:
# Ask three related questions in order. The second and third say "they", so
# they only make sense together with the earlier turns held in the chain's memory.
query1 = "What does the policy say about attendance and timing?"
query2 = "Can they come late?"
query3 = "To whom can they submit attendance reporting?"

results = []
for number, question in enumerate((query1, query2, query3), start=1):
    # .invoke() is the supported way to run a chain; calling the chain object
    # directly (qa_chain({...})) is deprecated.
    result = qa_chain.invoke({"question": question})
    results.append(result)
    print(f"Question {number}:\n")
    print(f"Human: {question}")
    print(f'AI: {result["answer"]}')

# Keep the per-question result names available for any later cells.
result1, result2, result3 = results

Question 1:

Human: What does the policy say about attendance and timing?
AI: The SPIL Corporate HR policies specify the following regarding attendance and timing:

1. **Flexi-Timing Options**: Employees can choose between two timing options:
   - 9:30 AM to 6:00 PM
   - 10:00 AM to 6:30 PM

2. **Grace Period**: There is a 15-minute grace period for late arrivals. If an employee arrives beyond this grace period, leave will be deducted.

3. **Extreme Late Arrivals**: Employees are allowed to come late up to 11:45 AM based on extreme emergencies.

4. **Daily Time Report (DTR)**: Employees in the Marketing Department must submit a Daily Time Report to their Reporting Officers. Attendance is marked based on this report, and the Reporting Officers inform the HR Department about the attendance of their respective team members or subordinates.

5. **Weekends and Public Holidays**: Employees need to take approval from their Reporting Officers, Director, or Managing Director to work on weekends

In [None]:
# Query the vector store directly (no LLM involved) for the 3 chunks most
# similar to the question.
search_query = "What is the notice period?"
similar_docs = vector_store.similarity_search(query=search_query, k=3)

# Show each hit's text followed by its metadata.
for i, doc in enumerate(similar_docs):
    print(
        f"\n**********Page Content: Document {i+1}**********\n",
        doc.page_content,
        f"Metadata: {doc.metadata}",
        sep="\n",
    )


**********Page Content: Document 1**********

SPIL Corporate HR Policies  
 
 
g) For Transferee, the clause of employment, policies and procedures of new company will be 
applicable. The proper appointment letter and other joining material will be given.  
Section 10:  Notices Period 
a) On Probation, the employee can resigns from the services by giving 10 days of notice.  
 
b) On Confirmation, the employee can resign from the services by giving the notice of 30 days 
(i.e. one month).
Metadata: {'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® Word 2016', 'creationdate': '2020-08-26T06:56:00+00:00', 'author': 'hr', 'moddate': '2020-08-26T06:56:00+00:00', 'source': 'company_policy.pdf', 'total_pages': 24, 'page': 11, 'page_label': '12'}

**********Page Content: Document 2**********

month notice is to be given to the employee and if required then based upon the re -evaluation 
from the superiors of the Department, the employee shall be revoke from the organization or else 
the