In [None]:
!pip install faiss-cpu


In [None]:
import os
import warnings
from dotenv import load_dotenv

# Process-wide setup; runs before any heavy library is imported.
# NOTE(review): KMP_DUPLICATE_LIB_OK presumably works around a duplicate
# OpenMP runtime error when faiss is loaded — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# Blanket-ignore every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Pull variables from a .env file into the environment; kept as the final
# expression so the call's return value is shown as the cell output.
load_dotenv()


In [None]:
from langchain_community.document_loaders import PyMuPDFLoader

# Smoke-test the loader on one PDF before processing the whole dataset.
# The path is assembled with os.path.join so it also resolves on POSIX
# systems; the previous backslash-separated literal only worked on Windows
# (where os.path.join produces exactly the same string as before).
sample_pdf = os.path.join(
    "rag-datset",
    "gym supplements",
    "1. Analysis of Actual Fitness Supplement.pdf",
)
loader = PyMuPDFLoader(sample_pdf)

# Last expression of the cell: the loaded documents are rendered as output.
loader.load()

In [None]:
# Recursively collect the path of every PDF under the dataset folder.
#   * The suffix test is case-insensitive, so files named "*.PDF" are no
#     longer skipped.
#   * The result is sorted because the order in which os.walk reports entries
#     depends on the filesystem; sorting keeps everything derived from this
#     list in a reproducible order.
# A missing "rag-datset" directory still yields an empty list.
pdfs = sorted(
    os.path.join(root, file)
    for root, _dirs, files in os.walk("rag-datset")
    for file in files
    if file.lower().endswith(".pdf")
)

In [None]:
# Display the list of PDF paths collected from the dataset folder.
pdfs

In [None]:
# Load every discovered PDF and accumulate all returned documents in one
# flat list.
docs = []
for pdf_path in pdfs:
    loader = PyMuPDFLoader(pdf_path)
    docs += loader.load()  # in-place extend with this file's documents

In [None]:
# Total number of documents loaded across all PDFs.
len(docs)

In [None]:
# Display every loaded document.
# NOTE(review): this renders the entire list, which may produce a very large
# cell output; the following cell already previews the first four entries.
docs

In [None]:
# Preview only the first four loaded documents.
docs[:4]

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Splitter settings gathered in one place: chunk_size 1000 with an overlap of
# 100 between neighbouring chunks.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded documents into chunks; these are what get embedded and
# indexed further down.
chunks = text_splitter.split_documents(docs)

In [None]:
# Compare sizes at each stage: (number of chunks, number of documents, number of PDF files).
len(chunks), len(docs), len(pdfs)

In [None]:
# Inspect the first loaded document: metadata first, then its text.
# The metadata is printed explicitly because a bare expression is only
# rendered when it is the last statement of a cell; previously its value was
# evaluated and silently discarded.
print(docs[0].metadata)
print(docs[0].page_content)

In [None]:
from langchain_ollama.embeddings import OllamaEmbeddings
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore



In [None]:
# Embedding client for the "nomic-embed-text" model; it expects an Ollama
# server to be reachable at localhost:11434.
embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="nomic-embed-text",
)

# Embed the first chunk as a probe; the length of this vector is used
# afterwards to size the FAISS index.
first_chunk_text = chunks[0].page_content
vector = embeddings.embed_query(first_chunk_text)

In [None]:
# Preview only the first few components of the probe embedding; displaying
# the whole vector would print every component into the cell output.
vector[:5]

In [None]:
# Dimensionality of the embedding space, taken from the probe vector.
embedding_dim = len(vector)

# FAISS L2 index sized to that dimensionality.
index = faiss.IndexFlatL2(embedding_dim)

# Show the dimension. In the earlier version `len(vector)` was not the last
# expression of the cell, so it was computed but never displayed.
embedding_dim

In [None]:
# Display the FAISS index object created from the embedding dimension.
index

In [None]:
# Wire the pieces into a vector store: the embedding client, the FAISS index
# created earlier, a fresh in-memory docstore, and an empty
# index-position -> docstore-id mapping.
docstore = InMemoryDocstore()
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    docstore=docstore,
    index_to_docstore_id={},
)


In [None]:
# Number of vectors currently held by the store's index (checked before any documents are added).
vector_store.index.ntotal

In [None]:
# Embed all chunks and add them to the store, keeping the returned ids.
# NOTE(review): re-running this cell presumably adds the same chunks a second
# time — confirm, and rebuild the store first if a clean index is required.
ids = vector_store.add_documents(
    documents=chunks,
)

In [None]:
# Compare the number of ids returned by add_documents with the number of vectors now in the index.
len(ids), vector_store.index.ntotal

In [None]:
# Sample retrieval: request the 5 best matches for a test question using the
# store's "similarity" search type.
question = "How to gain muscle mass?"
result = vector_store.search(
    query=question,
    search_type="similarity",
    k=5,
)

# Last expression of the cell: render the retrieved documents.
result

In [None]:
# Persist the populated vector store to a local folder named after the
# database.
db_name = "health_supplements"
vector_store.save_local(folder_path=db_name)