In [1]:
#pip install faiss-cpu

In [2]:
import os
import warnings
from dotenv import load_dotenv

#If you get duplicate error run---

#os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
#warnings.filterwarnings("ignore")
# ----


env_path = "../OllamaSetup/.env"  
load_dotenv(env_path)

True

## Document Loader

In [3]:
from langchain_community.document_loaders import PyMuPDFLoader

In [4]:
## Read the list of pdfs in directory

pdfs=[]

for root, dirs, files in os.walk("rag-dataset"):
    #print(root,dirs,files)
    for file in files:
        if file.endswith(".pdf"):
            pdfs.append(os.path.join(root,file))

In [5]:
pdfs

['rag-dataset/health supplements/1. dietary supplements - for whom.pdf',
 'rag-dataset/health supplements/3.health_supplements_side_effects.pdf',
 'rag-dataset/health supplements/2. Nutraceuticals research.pdf',
 'rag-dataset/gym supplements/2. High Prevalence of Supplement Intake.pdf',
 'rag-dataset/gym supplements/1. Analysis of Actual Fitness Supplement.pdf']

In [6]:
docs=[]
for pdf in pdfs:
    loader= PyMuPDFLoader(pdf)
    temp= loader.load()
    docs.extend(temp)



In [7]:
len(docs)

64

## Create Document Chunks

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


text_splitter = RecursiveCharacterTextSplitter(chunk_size= 1000, chunk_overlap= 100)
chunks= text_splitter.split_documents(docs)

In [9]:
len(chunks)

311

In [10]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o-mini")
len(encoding.encode(chunks[0].page_content))

271

## Document Vector Embedding

In [11]:
from langchain_ollama import OllamaEmbeddings
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [12]:
embeddings = OllamaEmbeddings(model= 'nomic-embed-text:latest',base_url='http://localhost:11434')

In [13]:
vector = embeddings.embed_query("hello world")
index = faiss.IndexFlatL2(len(vector))
index.ntotal,index.d

(0, 768)

In [14]:



vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [15]:
vector_store.index.ntotal , vector_store.index.d

(0, 768)

In [18]:
ids =vector_store.add_documents(documents=chunks)

  ids =vector_store.add_documents(documents=chunks)


In [19]:
len(ids),vector_store.index.ntotal

(311, 311)

## Retrieval

In [20]:
question = "how to gain muscle mass"
docs= vector_store.search(question,k=5, search_type="similarity")
docs

[Document(id='e143abd7-93de-4cae-a226-ecaaae84e469', metadata={'producer': 'iLovePDF', 'creator': '', 'creationdate': '', 'source': 'rag-dataset/gym supplements/2. High Prevalence of Supplement Intake.pdf', 'file_path': 'rag-dataset/gym supplements/2. High Prevalence of Supplement Intake.pdf', 'total_pages': 11, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-10-21T11:39:04+00:00', 'trapped': '', 'modDate': 'D:20241021113904Z', 'creationDate': '', 'page': 8}, page_content='and strength gain among men. We detected more prevalent protein and creatine supplementation\namong younger compared to older ﬁtness center users, whereas the opposite was found for vitamin\nsupplementation. Other authors made similar observations [23] and it might reﬂect the diﬀerent\ntraining goals among age groups, i.e., more focus on strength and muscles among the younger and\nmore focus on health among the older age groups.\nComparable to other studies [4], we dete

In [22]:
db_name= "health_supplements"
vector_store.save_local(db_name)