In [1]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load Raw PDFs

In [2]:
def load_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for files matching the
        ``*.pdf`` glob, each parsed with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [3]:
# Directory that holds the source PDF files (relative to the working directory).
DATA_PATH = 'data/'

In [4]:
# Load the PDFs found under DATA_PATH.
documents = load_files(DATA_PATH)

In [5]:
# Sanity check: how many documents the loader returned.
len(documents)

759

# Create Text Chunks

In [6]:
def create_chunks(extracted_data, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            50, the value that was previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [7]:
# Split the loaded documents into chunks for embedding.
text_chunks = create_chunks(documents)

In [8]:
# Sanity check: how many chunks the splitter produced.
len(text_chunks)

7079

# Create Vector Embeddings

In [9]:
from langchain_huggingface import HuggingFaceEmbeddings

In [10]:
def get_embedding_model(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Build the HuggingFace embedding model used to vectorize text chunks.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``'sentence-transformers/all-MiniLM-L6-v2'``, the
            value that was previously hard-coded.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [11]:
# Instantiate the embedding model with its default configuration.
embedding_model = get_embedding_model()

  from .autonotebook import tqdm as notebook_tqdm


# Store Embeddings in FAISS

In [12]:
from langchain_community.vectorstores import FAISS

In [13]:
# Location where the FAISS index is saved (relative to the working directory).
DB_FAISS_PATH = 'vectorstore/db_faiss'

In [14]:
# Embed every chunk and build the FAISS vector store from the results.
db = FAISS.from_documents(documents=text_chunks, embedding=embedding_model)

In [15]:
# Persist the vector store to DB_FAISS_PATH so it can be reloaded later.
db.save_local(DB_FAISS_PATH)