In [45]:
# Load the documents

import os
from langchain_community.document_loaders import PyMuPDFLoader
import spacy
from langchain_text_splitters.spacy import SpacyTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
import torch
import warnings
from tqdm.notebook import tqdm

# NOTE(review): called without a category, this suppresses every warning from
# here on, including deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")


In [46]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [47]:
# Show which device was selected.
device

device(type='cuda')

In [48]:
# Location of the catalogue PDF. The original machine-specific path remains the
# default; set the PDF_FILE_PATH environment variable to run the notebook
# against a file stored elsewhere without editing this cell.
pdf_file_path = os.environ.get(
    "PDF_FILE_PATH",
    r'E:\Others\document_assistant\VWAccessories Catalogue_2020_spread.pdf',
)

In [49]:
def pdf_loader(pdf_path):
    """Read the PDF at ``pdf_path`` with PyMuPDFLoader and return the result of ``load()``."""
    return PyMuPDFLoader(pdf_path).load()

In [50]:
#Splitting the text from documents

In [51]:
# Load the catalogue, then print the text of the entry at index 4 as a quick
# sanity check of what the loader extracted.
document = pdf_loader(pdf_file_path)
print(document[4].page_content)

→  Volkswagen Genuine hub caps1
Volkswagen Genuine hub caps with an embossed
Volkswagen logo impress with their dynamic 
spinning function, which guarantees that the 
Volkswagen logo remains in the perfect position 
while driving and in any parking situation.
1	 Use of the hub cap depends on the wheel.  
	 Please ask your Volkswagen dealership.
8 |
| 9
01   LED tail light with animated indicator light
     
• Premium Volkswagen Genuine quality
• Exclusive tinted design
• For easy replacement or retrofitting
• 1 set, consisting of four LED tail lights,
  adapter for the electrical installation kit and
  installation instructions with coding instructions.
Part number: 5H1.052.200 - LHD
Part number: 5H2.052.200 - RHD
02   Protective strip for the tailgate
The protective strip for the tailgate with its C hrome
look lends the vehicle a refined and elegant
appearance. Quick and easy attachment to the lower
edge of the tailgate. Also acts as an edge protector.
01
02



In [52]:
# Sentence-based split using spaCy's 'sentencizer' pipeline.
# Stored under its own names: the following cell binds `text_splitter` and
# `doc_split` to the RecursiveCharacterTextSplitter result, which previously
# overwrote this output before anything could use it.
spacy_text_splitter = SpacyTextSplitter(pipeline='sentencizer')
doc_split_spacy = spacy_text_splitter.split_documents(document)

In [53]:
# Recursive character splitting with chunk_size=1000 and chunk_overlap=100;
# `doc_split` is the chunk list every later cell works with.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
doc_split = text_splitter.split_documents(documents=document)

In [55]:
# Inspect the raw text of the chunk at index 7.
doc_split[7].page_content

'| 13\nPanel trim protective film\nProtected\nat any time\nLooks good and protects: \nThe black and precisely cut \nsill protector film with silver \ndecorative stripes protects \nagainst scratches around \nthe door sill. Easy to install.\n01   Textile floor mats front and rear, Black,\n\t\nPremium with Golf branding, left-hand drive\n\t\nPart number: 5H1061270 WGK\n02   Textile floor mats front and rear, Black,\n\t\n Premium with Golf branding, right-hand drive\n\t\n Part number: 5H2061270 WGK\n03   Textile floor mats front and rear, Black, Optimat\n\t\n Part number: 5H1061445 WGK\n04   Textile floor mats front and rear, "Plus",\n\t\n Black, left-hand drive\n\t\n Part number: 5G1061404 WGK\n05   All-weather floor mats front and rear,\n\t\n Titanium Black, left-hand drive\n\t\n Part number: 5H1061500 82V\n06   All-weather floor mats front and rear,\n\t\n Titanium Black, left-hand drive\n\t\n Part number: 5H1061500 82V\n07   All-weather floor mats rear, Titanium Black\n\t\nPart number: 

In [56]:
#Embedding
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document as LangchainDocument

In [57]:
model_name = 'sentence-transformers/all-mpnet-base-v2'
# Use the device selected earlier ("cuda" or "cpu") rather than a hard-coded
# 'cuda', so the embedding model also loads on machines without a GPU.
model_kwargs = {'device': device.type}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [58]:
# Embed the text of every chunk.
# NOTE(review): tqdm wraps only the list comprehension that collects the chunk
# texts. That list is fully built before embed_documents() is called, so the
# progress bar does not reflect the embedding work itself.
embed_docs  = embeddings.embed_documents([doc.page_content for doc in tqdm(doc_split)])


  0%|          | 0/52 [00:00<?, ?it/s]

In [59]:
# Count the entries returned by embed_documents().
len(embed_docs)

52

In [60]:
# Embed a single query string with the same embedding model.
# NOTE(review): `query_embed` is not referenced by any later cell visible in
# this notebook, and the sample question is unrelated to the catalogue —
# confirm whether this cell is still needed.
sample_query = "What is the ideal BP for the humans?"
query_embed = embeddings.embed_query(sample_query)

In [61]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [62]:
# Embed a throwaway string once to learn the vector width the index must have.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Empty FAISS-backed store: flat L2 index, in-memory docstore, no ids mapped yet.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [63]:
# Display the first chunk in full (metadata and page_content).
doc_split[0]

Document(metadata={'source': 'E:\\Others\\document_assistant\\VWAccessories Catalogue_2020_spread.pdf', 'file_path': 'E:\\Others\\document_assistant\\VWAccessories Catalogue_2020_spread.pdf', 'page': 0, 'total_pages': 33, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign 15.0 (Macintosh)', 'producer': 'Adobe PDF Library 15.0', 'creationDate': "D:20201213133913+04'00'", 'modDate': "D:20210217151128+02'00'", 'trapped': ''}, page_content='| 01\nAccessories Catalogue \nFor Passenger Cars')

In [64]:
from uuid import uuid4

# Plain text of every chunk.
docs = list(map(lambda chunk: chunk.page_content, doc_split))

# One random UUID string per chunk, used as that chunk's id in the vector store.
uuids = [str(uuid4()) for _ in doc_split]
vector_store.add_documents(ids=uuids, documents=doc_split)

['8b004a68-b7ff-4563-8a9b-fad5da839089',
 'da51e3dd-d567-40f8-9a20-bb8cbec5221b',
 'ca8f8d58-9d42-42d8-83eb-69d480ff8807',
 'a5284797-064b-43f3-ae8c-5c550bc6da5d',
 'e59acf2a-8163-405b-868b-d374edbd6917',
 'a56ed361-4bba-4104-87e6-48c6990a59a3',
 'defd1e5e-5038-4875-a2e2-928ecd3fc9e0',
 '4a9eb8b5-d9d0-47e8-bb4a-6ceaac76ffda',
 'e045e4f6-9655-412b-9242-1d706e3d5b74',
 'eb014f9e-53de-40d7-afca-262e3fecdd74',
 '304a7b80-5742-4c8f-9a5b-cb02f4b8c659',
 '665409a7-f28a-47c0-9563-896056795b8f',
 'da9d1f2d-0d38-406b-bd32-c1d4ca007b43',
 'eefd9c6f-e4c2-4387-8ce6-3e46a790581d',
 '01794446-730e-4008-a11c-b16376ac4096',
 'c1138923-9424-4050-9203-23f601c0f01c',
 '0ce838d6-45ed-4108-9a53-c4cdcdb15d3f',
 'b8c41e5f-197c-4a5c-9ea0-aa0efc006e68',
 '8e75e9de-7a6c-4c8a-9189-05770fafbee9',
 'dcc220f0-f9b8-41bc-af35-dd70d904b447',
 '92fe6011-2cf1-49a8-ac1e-06977f880752',
 'f8534dd9-3e2d-41db-8f2a-ceb8a3765ce1',
 '354b068c-bba9-4155-b295-0b0e61f5f600',
 '8b86e2a8-fe6d-464b-b4be-33e5aeac35fc',
 'c6c0df5c-0ae5-

In [65]:
# Fetch the three chunks closest to the question and print each with its metadata.
results = vector_store.similarity_search("Does the car has camera?", k=3)

for res in results:
    text, meta = res.page_content, res.metadata
    print(f"* {text} [{meta}]")

* 01    Universal Traffic Recorder – 
front and rear
The 2-camera system records what 
happens in front of and behind the 
vehicle in the event of damage. It is 
installed on the windscreen and 
rear window in a visually appealing 
way and firmly glued to the 
windscreen/window. 
Part number: 000.063.511.E
02    Cradle for Mobile
The universal smartphone holder 
keeps a tight grip on your mobile 
with its 70mm diameter suction 
cup which provides an incredibly 
strong bond to the windscreen or 
any other smooth surface. 
Part number: 000.051.435.AG
03    USB extension
The 2216 USB extension cable is particularly
suited to use with the Volkswagen CarStick,
and can also be used to connect other USB-
connectable products to the infotainment
system. 
Part number: 000.051.446.T
04    Reading light
The LED reading light from Volkswagen
Accessories allows passengers to also have
light at the back of the car when it’s dark
outside. The universally usable LED reading [{'source': 'E:\\Others\\do

In [66]:
# Fetch the two closest chunks together with their scores.
results = vector_store.similarity_search_with_score(
    "What is the torque?", k=2
)
# The store is backed by faiss.IndexFlatL2, so the score is a distance
# (smaller = closer match), not a similarity; the label says so.
# `.3f` gives three decimals; the earlier `3f` only set a minimum field width
# and therefore printed the default six decimals.
for res, score in results:
    print(f"* [DIST={score:.3f}] {res.page_content} [{res.metadata}]")

* [SIM=1.241033] rating (kg): 1,810
Snow chains servo - SUV
Tyre size: 255/60 R18,
255/55 R19
Scope of supply: 2
pieces, incl. storage bag
Quality without
compromise
With the light alloy rims and complete wheels for the new Touareg, there are a large number of options, and 
variety in design with no compromises on quality and functionality.
The Volkswagen Genuine light alloy wheels
• Strength tests check the strains on the wheel under excess load, shocks and cornering.
• Material and surface are tested for tensile strength and elongation as well as resistance to scratches
  and corrosion.
• Vehicle tests check the complete wheels under rigorous conditions on the test track.
• The endurance test on the two-axle wheel inspection rig simulates high loads.
The fully automatic mounting of complete wheels takes place on state-of-the-art production systems.
Wheel and tyres are combined to produce a high-quality unit. Precise production under temperature- [{'source': 'E:\\Others\\document_assi

In [67]:
##Using LLM to generate
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain


# Load variables from the .env file into the environment.
# NOTE(review): presumably this is where the Groq API key comes from — confirm.
load_dotenv()

True

In [68]:
# Chat model used by every chain below.
# NOTE(review): no temperature is passed, so answers may differ between runs —
# confirm whether that is intended. Also confirm that this model id is still
# served by Groq before re-running.
llm = ChatGroq(model="llama3-8b-8192")

In [69]:
#Defining prompts for effective retrieval

# System prompt shared by the QA chains. Adjacent string literals are joined
# with nothing in between, so each sentence carries its own trailing space;
# without it the sentences ran together ("...to the query.Keep the answer...").
# "{context}" is the slot the documents chain fills with the retrieved chunks.
system_prompts = (
    "You're an assistant who can retrieve data from the document. Use the given context to give the most appropriate answer to the query. "
    "Keep the answer concise and to the point. If you don't know the answer, you can say 'I don't know'. "
    "Dont add lines like 'according to the document' or 'as per the document'."
    "\n\n"
    "{context}"
)

# Two-message prompt: the system instructions (with the context slot) followed
# by the user's question.
prompt = ChatPromptTemplate(
    [
        ("system", system_prompts),
        ("human", "{input}"),
    ]
)

# Retriever over the FAISS store -> documents chain -> full retrieval chain.
retriever = vector_store.as_retriever()
qa_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(retriever=retriever, combine_docs_chain=qa_chain)

In [70]:
response = rag_chain.invoke({"input": "What color variants does it have?"})
response["answer"]

'Dark Graphite and Grey.'

In [71]:
# Adding chat history

from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

# Instruction for the question-rewriting step: turn the latest user message
# into a standalone question without answering it.
contextualize_q_system_prompt = " ".join(
    [
        "Given a chat history and the latest user question",
        "which might reference context in the chat history,",
        "formulate a standalone question which can be understood",
        "without the chat history. Do NOT answer the question,",
        "just reformulate it if needed and otherwise return it as is.",
    ]
)

# Prompt for the rewriting step: instructions, then the prior messages injected
# at the "chat_history" placeholder, then the newest user input.
contextualize_q_prompt = ChatPromptTemplate(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ]
)

# Retriever that uses the LLM and the prompt above together with the base retriever.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)


In [72]:
# Answering prompt: same system instructions as before, now with the chat
# history inserted between the system message and the user's question.
qa_prompt = ChatPromptTemplate(
    [
        ("system", system_prompts),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ]
)

# Rebind qa_chain / rag_chain to the history-aware versions.
qa_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=qa_chain,
)

In [75]:
from langchain_core.messages import AIMessage,HumanMessage

# Manual two-turn conversation: the first exchange is recorded in
# `chat_history` and passed along with the follow-up question.
chat_history = []

question = "What are the color variants and tyre size?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})

# Record the first turn (user question, then model answer).
first_turn = (
    HumanMessage(content=question),
    AIMessage(content=ai_msg_1["answer"]),
)
chat_history.extend(first_turn)

second_question = "What is the biggest size tyre, and for which variant?"
ai_msg_2 = rag_chain.invoke(
    {"input": second_question, "chat_history": chat_history}
)

print(ai_msg_2["answer"])

The biggest size tyre is 285/45 R20, and it is for the Braga 20-inch and Nevada 20-inch variants.


In [76]:
#Stateful management of chat history
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# session_id -> ChatMessageHistory, kept in memory for the life of the kernel.
store = {}

def get_session_history(session_id):
    """Return the history stored for ``session_id``, creating an empty one on first use."""
    try:
        return store[session_id]
    except KeyError:
        history = store[session_id] = ChatMessageHistory()
        return history

# Wrap the RAG chain with per-session history: "input" is the user message key,
# "chat_history" the key the history is supplied under, "answer" the output key.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [79]:
# Ask within session "abc123"; the cell displays only the answer text.
session_config = {'configurable': {"session_id": "abc123"}}
floor_mats_reply = conversational_rag_chain.invoke(
    {"input": "What is the part number for floor mats?"},
    config=session_config,
)
floor_mats_reply["answer"]

'Some part numbers for floor mats mentioned in the document are:\n\n* 5H1061270 WGK\n* 5H2061270 WGK\n* 5H1061445 WGK\n* 5G1061404 WGK\n* 5H0061512 82V\n* 5NL061279 WGK'

In [80]:
# Second question in session "abc123".
session_config = {'configurable': {"session_id": "abc123"}}
devices_reply = conversational_rag_chain.invoke(
    {"input": "Can we carry devices?"},
    config=session_config,
)
devices_reply["answer"]

'Yes, according to the document, you can carry devices using the following accessories:\n\n* Tablet holder (Part number: 000.061.125.E or 000.061.125.N)\n* Hook for headset (Part number: 000.019.819.C)\n* Cradle for Mobile (Part number: 000.063.511.E)\n* USB extension (Part number: 000.051.435.AG)'

In [81]:
# Follow-up in session "abc123" that only makes sense given the previous turn.
session_config = {'configurable': {"session_id": "abc123"}}
ipads_reply = conversational_rag_chain.invoke(
    {"input": "What about ipads?"},
    config=session_config,
)
ipads_reply["answer"]

'According to the document, you can carry iPad devices using the following accessories:\n\n* Tablet holder for iPad 2-4 (Part number: 000.061.125.A)\n* Tablet holder for iPad Air (Part number: 000.061.125.E)'

In [83]:
# Another question in session "abc123".
session_config = {'configurable': {"session_id": "abc123"}}
comfort_reply = conversational_rag_chain.invoke(
    {"input": "Is the car comfortable?"},
    config=session_config,
)
comfort_reply["answer"]

'According to the document, the Volkswagen Genuine roof box Comfort 340 and Comfort 460 mention an "optimised aerodynamic design that reduces unpleasant driving noises as much as possible" and a "3-point central locking mechanism", which suggests that the car is designed to provide a comfortable and secure driving experience.'

In [84]:
# Another question in session "abc123".
session_config = {'configurable': {"session_id": "abc123"}}
social_reply = conversational_rag_chain.invoke(
    {"input": "What is the social media links for the Volkswagen?"},
    config=session_config,
)
social_reply["answer"]

'According to the document, the social media links for Volkswagen Middle East are:\n\n* YouTube: youtube.com/volkswagenmiddleeast\n* Facebook: facebook.com/VolkswagenME\n* Instagram: instagram.com/volkswagenme'

In [85]:
# Summary request in session "abc123".
session_config = {'configurable': {"session_id": "abc123"}}
summary_reply = conversational_rag_chain.invoke(
    {"input": "Can you summarize the document for me?"},
    config=session_config,
)
summary_reply["answer"]

'The document appears to be an accessories catalogue for Volkswagen passenger cars in the Middle East. It lists various accessories, including roof boxes, wheel holders, pedal caps, all-weather floor mats, and tablet holders. The document also mentions the official Volkswagen Middle East social media channels and website. Additionally, it provides information on the Volkswagen Genuine roof box Comfort 340 and Comfort 460, highlighting their features and benefits.'