In [1]:
import os
import ollama
from langchain.vectorstores import Chroma  # Use Chroma from here
from langchain.schema import Document  # Keep only one Document import
from langchain_community.embeddings import OllamaEmbeddings
from langchain.prompts import ChatMessagePromptTemplate, PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [2]:
embeddings = 'nomic-embed-text'

In [3]:
embedModel = 'nomic-embed-text:v1.5'
llamaModel = 'llama3.1:8b'

In [4]:
sourceDirectory = "/home/xpert/Documents/llmRAG"

In [5]:
fileNames = [fileName for fileName in os.listdir(sourceDirectory) if fileName.endswith('pdf')]
pageList = []


In [6]:
for file in fileNames:
    filePath=os.path.join(sourceDirectory,file)
    loader=PyPDFLoader(file_path=filePath)
    pages=loader.load()
    pageList.extend(pages)

In [7]:
print(pageList)

[Document(metadata={'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 0}, page_content='ANNUAIRE DES ENTREPRISES RENCONTRE ES AU SIDO 2024  \n \nContact  Position  Téléphone  Email  Entreprise  Description  \nWilliam \nPEDICO  Account \nManager  +31 06 27 87 \n18 66  William_Pedico@apacer.nl  Apacer  SDD & RAM  \nPhilippe \nDUSSEL  Sales Engineer  06 60 15 19 \n69 philipe .dussel@ceista.fr  CEISTA  Prototypage et \nproduction de \ncartes \nélectroniques  \nAntoine \nERRIGO  CEO 04 74 32 23 \n60 direction@cadnov.fr  CadNov  Impression  3D, \nUsinage plastique, \nExpert en \nplasturgie . \nFernando \nAGUIAR  Business \nDevelopment \nManager  06 25 45 77 \n57 f.aguiar@utcoverseas.com  UTC Overseas  Transport de \nmarchandises \ninternational  \nElizabeth \nPATOUILLARD  Directrice / \nResponsable de \nla diffusion \ntechnologique  06 95 12 51 \n76 elisabeth.patou illard@cresitt.com  CRESIT T \nINDUSTRIE  Centre de \nrecherche \ntechnologique  sur \nla RF et la \nCompatibilité \nél

In [8]:
textSplitter=RecursiveCharacterTextSplitter(chunk_size=200,
                                              chunk_overlap=20,
                                              add_start_index=True)

In [9]:
textSplits=[]

In [10]:
textSplitsMetaData=[]
for page in pageList:
    split=textSplitter.split_text(page.page_content)
    textSplits.extend(split)
    PM=page.metadata
    for i in range(len(split)):
        textSplitsMetaData.append(PM)
        

In [11]:
print(textSplits)

['ANNUAIRE DES ENTREPRISES RENCONTRE ES AU SIDO 2024  \n \nContact  Position  Téléphone  Email  Entreprise  Description  \nWilliam \nPEDICO  Account \nManager  +31 06 27 87', '18 66  William_Pedico@apacer.nl  Apacer  SDD & RAM  \nPhilippe \nDUSSEL  Sales Engineer  06 60 15 19 \n69 philipe .dussel@ceista.fr  CEISTA  Prototypage et \nproduction de \ncartes \nélectroniques', 'électroniques  \nAntoine \nERRIGO  CEO 04 74 32 23 \n60 direction@cadnov.fr  CadNov  Impression  3D, \nUsinage plastique, \nExpert en \nplasturgie . \nFernando \nAGUIAR  Business \nDevelopment', 'Development \nManager  06 25 45 77 \n57 f.aguiar@utcoverseas.com  UTC Overseas  Transport de \nmarchandises \ninternational  \nElizabeth \nPATOUILLARD  Directrice / \nResponsable de \nla diffusion', 'la diffusion \ntechnologique  06 95 12 51 \n76 elisabeth.patou illard@cresitt.com  CRESIT T \nINDUSTRIE  Centre de \nrecherche \ntechnologique  sur \nla RF et la \nCompatibilité \nélectromagnétique  \nZiad', 'Ziad \nJABBOUR  Ph.

In [12]:
print(textSplitsMetaData)

[{'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 0}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 0}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 0}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 0}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 0}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 0}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 0}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 0}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 0}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 0}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 0}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 1}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 1}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 1}, {'source': '/home/xpert/Documents/llmRAG/SIDO.pdf', 'page': 1}, {'source': '/home/xpert/Documents/llmRA

In [13]:
embeddings = []
for split in textSplits:
    embedding = ollama.embeddings(model=embedModel, prompt=split)
    embeddings.append(embedding)
    

In [14]:
print(embeddings)

[{'embedding': [-0.3432295024394989, -0.07387693226337433, -3.685731887817383, -0.13070131838321686, 0.7066499590873718, -0.6391118764877319, 0.11515036970376968, -0.22257986664772034, 0.037794679403305054, 0.4649003744125366, -1.5914840698242188, -0.26688042283058167, 0.7409917116165161, -0.689812421798706, 1.030023455619812, 0.13008755445480347, 0.26079291105270386, -0.7699636220932007, -1.097054123878479, -0.43133345246315, 0.7403022050857544, -0.7886638641357422, -1.2846992015838623, -0.4527320861816406, 1.472700834274292, 0.37839704751968384, 0.8027370572090149, 1.0732989311218262, -0.7519263625144958, -1.0848381519317627, 0.024762064218521118, 1.079535961151123, -0.32429975271224976, 0.7163403034210205, -0.509924054145813, -0.39987099170684814, -0.0776294469833374, 0.29066944122314453, -0.44589903950691223, 0.3378666639328003, 0.6046671271324158, -1.211567997932434, -0.36004334688186646, 0.16040225327014923, 0.8573025465011597, 0.042823076248168945, 0.43387556076049805, 1.0223194

In [15]:
print(len(embeddings))
len(textSplits)

55


55

In [16]:
DocumentObjectList = [Document(page_content=data[0],metadata=data[1]) for data in zip(textSplits,textSplitsMetaData)]

In [17]:
vectorDataBase = Chroma.from_documents(
    documents=DocumentObjectList,
    embedding=OllamaEmbeddings(model=embedModel, show_progress=True)
)


OllamaEmbeddings: 100%|██████████| 55/55 [00:05<00:00, 10.38it/s]


In [18]:
model = ChatOllama(model=llamaModel)

In [19]:
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide these alternative questions separated by newlines.
    Original question: {question}""",)

In [20]:
retriever = MultiQueryRetriever.from_llm(vectorDataBase.as_retriever(),
                                         ChatOllama(model=llamaModel),
                                         prompt=QUERY_PROMPT)    

In [21]:
templateRAG = """First try to answer the question based only on the following context:
{context} Question: {question} and if you cannot answer then use LLM knowledge to help"""

In [22]:
prompt = ChatPromptTemplate.from_template(templateRAG)

In [23]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
)

In [24]:
question = '''can you tell me about Yannick +ABGRALL?'''

In [25]:
response=chain.invoke(question)

OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.38s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 16.69it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  9.40it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  9.51it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 16.11it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  6.79it/s]


In [26]:
print(response.content)

Based on the provided context, I found a document with page metadata {'page': 2, 'source': '/home/xpert/Documents/llmRAG/SIDO.pdf'} that contains information about Yvan RIDE.

From the content of this page (page_content='94 yannick.ab grall@arts-energy.com ARTS ENERGY Batteries haute \nperformances \nYvan RIDE Business\nDévelopper 06 88 21 02 \n23 y.ride@cthings.co CTHINGS IoT Gateway\nJean -Baptiste'):

* Yvan RIDE's business is development.
* He is the Business Développer at ARTS ENERGY, which specializes in high-performance batteries.
* His contact number is 06 88 21 02 and his email address is y.ride@cthings.co.

Please let me know if you'd like more information!


In [27]:
with open('output.txt','w',encoding="utf-8") as text_file:
    text_file.write(response.content)