In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


In [16]:
# Directory that is scanned for the source PDF files.
DATA_PATH = "data/"
# Folder where the FAISS index is written by create_vector_db().
DB_FAISS_PATH = "vectors/db/db_faiss"

In [17]:
# create the vector database
def create_vector_db():
    """Build a FAISS index from the PDFs under ``DATA_PATH`` and save it to disk.

    The function:
      1. loads every file matching ``*.pdf`` in ``DATA_PATH`` via ``PyPDFLoader``,
      2. splits the loaded documents with ``RecursiveCharacterTextSplitter``
         (``chunk_size=500``, ``chunk_overlap=20``),
      3. embeds the chunks with ``sentence-transformers/all-MiniLM-L6-v2`` on CPU,
      4. writes the resulting FAISS store to ``DB_FAISS_PATH``.

    Returns:
        None. The result is the index saved at ``DB_FAISS_PATH``.

    Raises:
        ValueError: if loading and splitting produced no text chunks, e.g.
            because ``DATA_PATH`` contains no readable PDFs.
    """
    loader = DirectoryLoader(DATA_PATH, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    texts = text_splitter.split_documents(documents)

    # Fail early with a clear message instead of loading the embedding model
    # and then hitting an opaque error inside FAISS on an empty input.
    if not texts:
        raise ValueError(
            f"No text chunks were produced from PDFs in {DATA_PATH!r}; "
            "nothing to index."
        )

    # load the sentence transformer model
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},
    )

    # The constructor that accepts Document objects is `from_documents` (plural);
    # `FAISS.from_document` does not exist and raised AttributeError.
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(DB_FAISS_PATH)

# 

