In [1]:
import os
from dotenv import load_dotenv

from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
load_dotenv()


True

In [4]:
embeddings_model = OpenAIEmbeddings()
print(embeddings_model.model)

text-embedding-ada-002


In [5]:
pdf_files_to_process = ["1706.03762v7.pdf", "1801.06146v5.pdf", "2103.15348v2.pdf"]
all_docs = []

In [7]:
for pdf_path in pdf_files_to_process:
    if os.path.exists(pdf_path):
        loader = PyPDFLoader(pdf_path)
        # Each page of the PDF is loaded as a separate Document object
        all_docs.extend(loader.load())
        print(f"  - Loaded {pdf_path}")
    else:
        print(f"  -  Warning: {pdf_path} not found. Skipping.")

  - Loaded 1706.03762v7.pdf
  - Loaded 1801.06146v5.pdf
  - Loaded 2103.15348v2.pdf


In [8]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
split_docs = text_splitter.split_documents(all_docs)

In [10]:
print(len(all_docs))
print(len(split_docs))

86
182


In [11]:
local_faiss_path = "faiss_index_langchain"


In [12]:
vector_store = FAISS.from_documents(split_docs, embeddings_model)


In [14]:
vector_store.save_local(local_faiss_path)

In [16]:
loaded_vector_store = FAISS.load_local(
    local_faiss_path,
    embeddings_model,
    allow_dangerous_deserialization=True
)
print("Vector store loaded successfully.")

# --- Perform a semantic search ---
query = "What is the transformer architecture?"
search_results = loaded_vector_store.similarity_search(query, k=3)



Vector store loaded successfully.


In [17]:
for i, doc in enumerate(search_results):
    print(f"\n--- Result {i+1} ---")
    # Display the source file for context
    source_file = os.path.basename(doc.metadata.get('source', 'Unknown'))
    print(f"Source: {source_file}")
    # Display the relevant content from the document chunk
    print(f"Content: {doc.page_content[:500]}...")


--- Result 1 ---
Source: 1706.03762v7.pdf
Content: Figure 1: The Transformer - model architecture.
The Transformer follows this overall architecture using stacked self-attention and point-wise, fully
connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,
respectively.
3.1 Encoder and Decoder Stacks
Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two
sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-
wise fully conn...

--- Result 2 ---
Source: 1706.03762v7.pdf
Content: Figure 1: The Transformer - model architecture.
The Transformer follows this overall architecture using stacked self-attention and point-wise, fully
connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,
respectively.
3.1 Encoder and Decoder Stacks
Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two
sub-layer