# RAG Application - POC

### Enter your query

In [1]:
# Read the user's question. Surrounding whitespace is stripped so it does not
# leak into the retrieval query or the final prompt.
query_text = input("Ask anything about our final year project").strip()
if not query_text:
    # Stop here instead of running retrieval and the LLM on an empty question.
    raise ValueError("query_text is empty; please enter a question")
query_text

Ask anything about our final year project What are the components used in this project ?


'What are the components used in this project ?'

### Import LangChain libraries

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import Ollama

### Import utility functions

In [3]:
from keyword_generator import extract_keywords
from db import get_db_collection, add_to_collection, query_collection

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\impostor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\impostor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Chroma DB connected


  from tqdm.autonotebook import tqdm, trange


Embedding function loaded


## Load the PDF document and index it into the vector database

In [4]:
# Relative path of the PDF report to index.
file_path = "docs/project-report.pdf"

# Load the PDF; the code below treats each loaded entry as one page.
loader = PyPDFLoader(file_path)
document = loader.load()

page_count = len(document)
print("No. of pages in the document:", page_count)

No. of pages in the document: 23


#### Split pages into chunks of text

In [5]:
# Chunking parameters handed to the splitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunked_documents = text_splitter.split_documents(document)

#### Prepare data for indexing
- Generate a unique ID for each chunk
- Generate keywords for metadata using NLP

In [6]:
# Parallel lists handed to the vector store: contents[i], ids[i] and
# keywords[i] all describe the same chunk.
contents = []
ids = []
keywords = []

# Number of chunks already seen for each (source, page) pair. Counting per
# pair keeps the ids unique even when chunks do not arrive in strictly
# increasing page order, or when they come from more than one source file.
chunks_seen = {}

for doc in chunked_documents:
    metadata = doc.metadata
    # Replace '/' and '.' in the source path with '-' so it can be embedded
    # in the id, e.g. "docs/project-report.pdf" -> "docs-project-report-pdf".
    source = metadata['source'].replace('/', '-').replace('.', '-')
    page_no = metadata['page']

    # Index of this chunk within its page (0 for the first chunk of a page).
    page_key = (source, page_no)
    c_index = chunks_seen.get(page_key, 0)
    chunks_seen[page_key] = c_index + 1

    chunk_id = f"{source}-p{page_no}-c{c_index}"

    contents.append(doc.page_content)
    ids.append(chunk_id)
    keywords.append(extract_keywords(doc.page_content))
    print("Processed chunk:", chunk_id)

Processed chunk: docs-project-report-pdf-p0-c0
Processed chunk: docs-project-report-pdf-p1-c0
Processed chunk: docs-project-report-pdf-p1-c1
Processed chunk: docs-project-report-pdf-p2-c0
Processed chunk: docs-project-report-pdf-p2-c1
Processed chunk: docs-project-report-pdf-p3-c0
Processed chunk: docs-project-report-pdf-p4-c0
Processed chunk: docs-project-report-pdf-p5-c0
Processed chunk: docs-project-report-pdf-p6-c0
Processed chunk: docs-project-report-pdf-p6-c1
Processed chunk: docs-project-report-pdf-p6-c2
Processed chunk: docs-project-report-pdf-p7-c0
Processed chunk: docs-project-report-pdf-p7-c1
Processed chunk: docs-project-report-pdf-p7-c2
Processed chunk: docs-project-report-pdf-p8-c0
Processed chunk: docs-project-report-pdf-p8-c1
Processed chunk: docs-project-report-pdf-p9-c0
Processed chunk: docs-project-report-pdf-p10-c0
Processed chunk: docs-project-report-pdf-p10-c1
Processed chunk: docs-project-report-pdf-p10-c2
Processed chunk: docs-project-report-pdf-p11-c0
Processed

### Create a collection in Chroma DB

In [7]:
# Name of the collection that holds this project's chunks.
COLLECTION_NAME = "my_project"
collection = get_db_collection(COLLECTION_NAME)

# One metadata dict per chunk: that chunk's keywords are flattened into a
# single comma-separated string stored under the "tags" key.
metadata = [dict(tags=", ".join(chunk_keywords)) for chunk_keywords in keywords]

add_to_collection(collection, contents, ids, metadata)

Add of existing embedding ID: docs-project-report-pdf-p0-c0
Add of existing embedding ID: docs-project-report-pdf-p1-c0
Add of existing embedding ID: docs-project-report-pdf-p1-c1
Add of existing embedding ID: docs-project-report-pdf-p2-c0
Add of existing embedding ID: docs-project-report-pdf-p2-c1
Add of existing embedding ID: docs-project-report-pdf-p3-c0
Add of existing embedding ID: docs-project-report-pdf-p4-c0
Add of existing embedding ID: docs-project-report-pdf-p5-c0
Add of existing embedding ID: docs-project-report-pdf-p6-c0
Add of existing embedding ID: docs-project-report-pdf-p6-c1
Add of existing embedding ID: docs-project-report-pdf-p6-c2
Add of existing embedding ID: docs-project-report-pdf-p7-c0
Add of existing embedding ID: docs-project-report-pdf-p7-c1
Add of existing embedding ID: docs-project-report-pdf-p7-c2
Add of existing embedding ID: docs-project-report-pdf-p8-c0
Add of existing embedding ID: docs-project-report-pdf-p8-c1
Add of existing embedding ID: docs-proje

Documents loaded to DB


### Chunks retrieved from the DB

In [8]:
# Query the collection with the user's question via the project helper.
# The bare expression on the last line displays the raw result for inspection.
query_result = query_collection(collection, query_text)
query_result

{'ids': [['docs-project-report-pdf-p2-c0',
   'docs-project-report-pdf-p10-c1',
   'docs-project-report-pdf-p14-c0']],
 'distances': [[0.43664721314807164, 0.437354370698935, 0.4389148767703823]],
 'metadatas': [[{'tags': 'machine, medicine, motor, vending, 11'},
   {'tags': 'machine, dispense, product, date, expiry'},
   {'tags': 'arduino, controller, display, characters, interfaced'}]],
 'embeddings': None,
 'documents': [['Certificate  i  \nAbstract  ii  \nList of Figures  iii  \nList of Tables  iv  \n1. INTRODUCTION  1- 4  \n      1.1. Field of Invention  2  \n      1.2. Background of Invention  2  \n      1.3. Scope  3  \n      1.4. Objectives  4  \n2. LITERATURE REVIEW  5-7  \n      2.1. Implementation of FSM Based Automatic Dispense Machine \nwith Expiry Date Feature Using VHDL    5  \n      2.2. Steven Woodbine, The Complete Vending Machine, Published \non 18 May 2011    6  \n      2.3. Design and fabrication of touch screen based automated \nmedical Vending machine  6  \n     

### Prepare the final prompt for the LLM

In [9]:
# Flatten the retrieved chunks into one context string. Chunks are separated
# by a blank line so the end of one chunk does not run into the start of the
# next one.
text = "\n\n".join(
    chunk
    for chunk_group in query_result['documents']
    for chunk in chunk_group
)

# `{context}` is deliberately left as a template variable instead of being
# pre-formatted into this string. ChatPromptTemplate parses the system message
# as a template, so injecting the raw document text here would make any '{' or
# '}' in the retrieved chunks look like a template variable and break
# formatting.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# Both values are substituted as data here, so their contents are never
# re-interpreted as template syntax.
final_prompt = prompt.format(context=text, input=query_text)
final_prompt

"System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.\n\nCertificate  i  \nAbstract  ii  \nList of Figures  iii  \nList of Tables  iv  \n1. INTRODUCTION  1- 4  \n      1.1. Field of Invention  2  \n      1.2. Background of Invention  2  \n      1.3. Scope  3  \n      1.4. Objectives  4  \n2. LITERATURE REVIEW  5-7  \n      2.1. Implementation of FSM Based Automatic Dispense Machine \nwith Expiry Date Feature Using VHDL    5  \n      2.2. Steven Woodbine, The Complete Vending Machine, Published \non 18 May 2011    6  \n      2.3. Design and fabrication of touch screen based automated \nmedical Vending machine  6  \n      2.4. ATM (All Time Medicine) counter for medicine  7  \n3. METHODOLOGY  8-13  \n3.1. BLOCK DIAGRAM  8  \n3.2. LCD Display  9  \n3.3.  Arduino Micro Controller  9  \n3.4.  Servo Motor  

### Connect to a local LLM (Phi-3 from Microsoft, served via Ollama)

In [10]:
# Settings for the local Ollama-served model.
# NOTE(review): keep_alive=-1 presumably keeps the model loaded between calls,
# and format="json" requests JSON-formatted output. Confirm both against the
# Ollama documentation.
OLLAMA_OPTIONS = {
    "model": "phi3",
    "keep_alive": -1,
    "format": "json",
}
llm = Ollama(**OLLAMA_OPTIONS)

### Final output from the LLM using the context

In [11]:
# Send the fully formatted prompt to the local model; the returned value is
# shown as the cell output.
llm.invoke(final_prompt)

'{"answer": "The components used in this project include an Arduino Micro Controller, LCD Display, Servo Motor, Stepper Motor Driver A4988, Stepper Motor NEMA 17, DC-DC LM2596 Buck Converter, and a Lithium Polymer Battery."}'