<a href="https://colab.research.google.com/github/pradhap1125/Vector_db_sematic_search/blob/main/vector_db_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
pip install pdfminer.six numpy langchain sentence-transformers faiss-cpu




In [15]:
import os
from pdfminer.high_level import extract_text  # For extracting text from PDF files
import numpy as np  # For numerical operations and array handling
from langchain_text_splitters import RecursiveCharacterTextSplitter  # For splitting long text into chunks
from sentence_transformers import SentenceTransformer  # For embedding sentences into vectors
import faiss  # Facebook AI Similarity Search - for fast vector similarity search

In [16]:
# Load a pre-trained sentence transformer model for generating text embeddings.
# The same instance embeds both the document chunks (initial_vector_load)
# and the search queries (search_resume).
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')

# A global list to keep track of the file names associated with each text chunk.
# Entry i is the source file of the i-th chunk passed to the FAISS index, so its
# order must match the order in which vectors were added.
# It is filled by initial_vector_load and read by search_resume. Unlike the index
# it is never written to disk, so it only exists for the current kernel session.
filenames = []

def extract_text_from_pdf(pdf_path):
    """Return the raw text of the PDF located at ``pdf_path``.

    This is a thin wrapper that delegates to pdfminer's ``extract_text`` and
    hands its result back unchanged.
    """
    return extract_text(pdf_path)

def load_data(pdf_path, chunk_size=16, chunk_overlap=0):
    """Extract the text of a PDF and split it into chunks.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Maximum chunk length handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 16, the value this
            notebook has always used.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 0.

    Returns:
        The list of text chunks produced by the splitter's ``split_text``.
    """
    text = extract_text_from_pdf(pdf_path)
    # The chunking parameters are arguments (with the historical defaults) so
    # callers can tune them without editing this function.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

def create_faiss_index(embeddings):
    """Build an in-memory FAISS inner-product index from a set of embeddings.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dim). Lists and
            non-float32 / non-contiguous arrays are accepted and converted.

    Returns:
        A ``faiss.IndexFlatIP`` holding all the given vectors, in input order.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # Convert once up front: this lets callers pass plain lists or float64
    # arrays, and gives FAISS the C-contiguous float32 matrix it works on.
    vectors = np.ascontiguousarray(embeddings, dtype='float32')
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim), "
            "got %d dimension(s)" % vectors.ndim
        )

    index = faiss.IndexFlatIP(vectors.shape[1])  # Inner-product similarity
    index.add(vectors)
    return index

def search_faiss_index(index, query_embedding, filenames, k=10, threshold=0.7):
    """Run a top-k search and return the filenames of sufficiently similar hits.

    Args:
        index: Object exposing ``search(queries, k)`` that returns a
            ``(scores, ids)`` pair of 2-D arrays (a FAISS index).
        query_embedding: Query vector(s). A single 1-D vector is promoted to a
            one-row matrix. Only the first query row's results are used.
        filenames: List mapping a vector id to the file it came from.
        k: Number of nearest neighbours requested from the index.
        threshold: Minimum score a hit must reach to be kept. Defaults to 0.7,
            the value that used to be hard-coded here.

    Returns:
        A list of filenames, one per kept hit, in the order the index returned
        them. The same filename can appear several times.
    """
    queries = np.atleast_2d(np.asarray(query_embedding, dtype='float32'))
    distances, indices = index.search(queries, k)

    matches = []
    for score, idx in zip(distances[0], indices[0]):
        # FAISS pads the id list with -1 when fewer than k vectors exist.
        # Without the lower bound, -1 would silently resolve to filenames[-1].
        if score >= threshold and 0 <= idx < len(filenames):
            matches.append(filenames[idx])
    return matches

def initial_vector_load(dir):
    """Index every PDF in a directory and save the FAISS index to disk.

    Each ``.pdf`` file in ``dir`` is split into chunks with ``load_data``, all
    chunks are embedded with the global ``model`` (normalized embeddings), and
    the resulting index is written to ``resume_index.faiss``. The global
    ``filenames`` list is then replaced with the source file of every chunk,
    in the same order as the vectors in the index.

    Calling this function again rebuilds both the index and ``filenames`` from
    scratch, so the two always describe the same set of chunks. If no chunk is
    found, nothing is written and ``filenames`` is left untouched.

    Args:
        dir: Directory containing the PDF files. The name shadows the built-in
            ``dir`` and is kept only for backward compatibility.
    """
    global filenames

    chunk_texts = []
    chunk_sources = []  # chunk_sources[i] is the file chunk_texts[i] came from

    # sorted() makes the chunk order, and therefore the index layout,
    # independent of the arbitrary order os.listdir returns.
    for file in sorted(os.listdir(dir)):
        if not file.endswith(".pdf"):
            continue
        for text in load_data(os.path.join(dir, file)):
            chunk_texts.append(text)
            chunk_sources.append(file)

    if not chunk_texts:
        return  # No data to index

    # Generate normalized embeddings for all text chunks
    embeddings = model.encode(chunk_texts, normalize_embeddings=True)
    embeddings = np.array(embeddings).astype('float32')

    # Create and save FAISS index
    index = create_faiss_index(embeddings)
    faiss.write_index(index, "resume_index.faiss")

    # Replace (rather than extend) the mapping, and only once the index has
    # been written: appending on every call left stale/duplicated entries
    # that no longer lined up with the freshly built index.
    filenames[:] = chunk_sources

def search_resume(keyword):
    """Return the unique filenames whose chunks match ``keyword``.

    Loads the index from ``resume_index.faiss``, embeds ``keyword`` with the
    global ``model`` (normalized embedding, cast to float32) and resolves the
    hits to filenames through the global ``filenames`` list.

    Note: ``filenames`` lives only in memory, so ``initial_vector_load`` must
    have been run in the current session for hits to resolve to filenames.

    Args:
        keyword: Free-text query.

    Returns:
        A list of distinct filenames, in the order ``search_faiss_index``
        first returned each of them.
    """
    index = faiss.read_index("resume_index.faiss")  # Load pre-built index
    query_embedding = model.encode([keyword], normalize_embeddings=True)
    query_embedding = np.array(query_embedding).astype('float32')
    results = search_faiss_index(index, query_embedding, filenames)
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # return the filenames in an arbitrary order that can change between runs.
    return list(dict.fromkeys(results))

In [17]:
# TODO: Change this path to the directory that holds your PDF files.
# Named pdf_dir rather than dir so the built-in dir() stays usable in the notebook.
pdf_dir = "/content/vector_db_search/"  # Directory containing PDF files
# Load the vector DB with the content of the files
initial_vector_load(pdf_dir)

In [18]:
# Search for the files whose content matches the provided keyword
print(search_resume("Java"))

['Resume_Pradhap_Rajamani.pdf']
