In [1]:
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import h5py


  from tqdm.autonotebook import tqdm, trange


# Function to load the PDFs and extract the full text of each document

In [2]:

def load_pdfs(pdf_files=None):
    """Extract the text of each PDF and return it keyed by document name.

    Args:
        pdf_files: Optional mapping of ``display name -> PDF path``. When
            omitted (``None``), the three 10-K filings this notebook was
            written for are used, so existing ``load_pdfs()`` calls behave
            exactly as before.

    Returns:
        dict mapping each name to the string returned by
        ``extract_text(path)`` for that file.
    """
    if pdf_files is None:
        # Default document set; paths are relative to the working directory.
        pdf_files = {
            "Alphabet Inc.": "goog-10-k-2023 (1).pdf",
            "Tesla, Inc.": "tsla-20231231-gen.pdf",
            "Uber Technologies, Inc.": "uber-10-k-2023.pdf",
        }
    # An explicitly empty mapping is respected and yields an empty result.
    return {name: extract_text(path) for name, path in pdf_files.items()}


# Function to generate embeddings by converting texts into high-dimensional vectors

In [3]:

def generate_embeddings(texts, model):
    """Encode every document in ``texts`` into one vector.

    Args:
        texts: Mapping of ``document name -> text``.
        model: Object exposing ``encode(list_of_str)`` that returns an
            indexable sequence with one vector per input string.

    Returns:
        dict mapping each document name to the first (and only) vector
        returned by ``model.encode([text])``.

    NOTE(review): each document is passed to the model as a single string;
    embedding models typically truncate input beyond their maximum sequence
    length -- confirm that whole-document embedding is what is intended.
    """
    return {
        doc_name: model.encode([doc_text])[0]
        for doc_name, doc_text in texts.items()
    }


# Function to store the embeddings in a FAISS index, which enables nearest-neighbour search at query time

In [4]:

def store_embeddings(embeddings):
    """Build a flat L2 FAISS index from a mapping of named embeddings.

    Args:
        embeddings: Mapping of ``name -> 1-D vector``; all vectors must have
            the same length.

    Returns:
        ``(index, metadata)`` where ``index`` is a ``faiss.IndexFlatL2``
        containing the vectors and ``metadata`` is the list of names in the
        same order the vectors were added (row ``i`` of the index belongs to
        ``metadata[i]``).

    Raises:
        ValueError: if ``embeddings`` is empty, or the vectors do not stack
            into a 2-D ``(count, dimension)`` array.
    """
    if not embeddings:
        # Previously this surfaced as a bare StopIteration from next(iter(...)).
        raise ValueError("embeddings is empty; nothing to index")

    # Dict iteration order fixes the row order; metadata mirrors it.
    metadata = list(embeddings)
    vectors = np.asarray([embeddings[name] for name in metadata], dtype='float32')
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be 1-D vectors of equal length; "
            f"got stacked array of shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, metadata


# Function to save a FAISS index and its associated metadata to an HDF5 file

In [5]:

def save_faiss_index(index, metadata, filename):
    """Write a FAISS index and its metadata list into one HDF5 file.

    Args:
        index: FAISS index; stored as the result of
            ``faiss.serialize_index(index)`` under the dataset
            ``'faiss_index'``.
        metadata: Sequence of names; converted to a NumPy byte-string array
            (``dtype='S'``) and stored under the dataset ``'metadata'``.
        filename: Path of the HDF5 file, opened in ``'w'`` mode.
    """
    with h5py.File(filename, 'w') as h5_file:
        serialized_index = faiss.serialize_index(index)
        h5_file.create_dataset('faiss_index', data=serialized_index)

        metadata_bytes = np.array(metadata, dtype='S')
        h5_file.create_dataset('metadata', data=metadata_bytes)


# Loading the all-MiniLM-L6-v2 model for embedding

In [6]:

# Load the sentence-transformers model by name; the resulting object is
# passed to generate_embeddings() in the processing cell.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


In [7]:

# Run the pipeline end to end:
#   1. extract the text of each PDF            -> pdf_texts  (name -> text)
#   2. encode each document with the model     -> embeddings (name -> vector)
#   3. add the vectors to a FAISS index        -> index, plus metadata (the
#      names in the order they were added)
pdf_texts = load_pdfs()
embeddings = generate_embeddings(pdf_texts, embedding_model)
index, metadata = store_embeddings(embeddings)


In [8]:

# Persist the FAISS index and its metadata together in a single HDF5 file
# ('faiss_index.h5', relative to the current working directory).
save_faiss_index(index, metadata, 'faiss_index.h5')
