In [None]:
import os
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS 

# Directory (relative to the working directory) that holds the source PDFs.
files_directory = 'files'

# Text of every page, each followed by a single space so that words on
# adjacent pages do not run together once the pieces are joined.
page_texts = []

# os.listdir() returns entries in arbitrary order; sorting keeps the combined
# text (and therefore the chunk boundaries derived from it) reproducible.
for filename in sorted(os.listdir(files_directory)):
    # Compare the extension case-insensitively so 'REPORT.PDF' is not skipped.
    if not filename.lower().endswith('.pdf'):
        continue

    # Open the PDF; the context manager closes it when the block exits.
    with pdfplumber.open(os.path.join(files_directory, filename)) as pdf:
        for page in pdf.pages:
            # A page without a text layer (e.g. a scanned image) yields no
            # text; some pdfplumber versions return None in that case, which
            # would make the string concatenation below raise TypeError.
            page_texts.append((page.extract_text() or '') + ' ')

# Join once at the end instead of growing a string with += inside the loop.
combined_text = ''.join(page_texts)

# print(combined_text)

# print(len(combined_text))
# Length 330573  

# Split the combined text into chunks, targeting a chunk size of 1000 with no
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
text_chunks = text_splitter.split_text(combined_text)

# Debug aid: how many chunks were produced.
# print(len(text_chunks))

# Embedding model used to vectorise the chunks:
# sentence-transformers/paraphrase-MiniLM-L6-v2, loaded through HuggingFace.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-MiniLM-L6-v2",
)

# Embed the chunks and load them into a FAISS vector store.
db = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

# Debug aid: the number of vectors in the index confirms it is populated.
# print(db.index.ntotal)

# Expose the vector store as a retriever, which returns the stored documents
# relevant to a given query.
retriever = db.as_retriever()

# Debug aid: inspect the retriever's search settings (e.g. how many chunks it
# returns per query).
# print(retriever.search_kwargs)

# Debug aid: try the retriever on a sample question.
# print(retriever.invoke("What is the summary of talk between CEOs of Capgemini and Schneider Electric on sustainability and digital transformation?"))



{}
