In [1]:
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import logging
import os
import time
from typing import List, Tuple, Callable, Any, Dict, Optional
import psutil

  from .autonotebook import tqdm as notebook_tqdm


In [49]:
# Demo corpus: 52 one-sentence statements about machine learning. Each entry is
# one document; its position in this list is the id the FAISS index returns.
sentences = [
    "Modern machine learning involves the use of sophisticated algorithms and computational models to enable computers to learn from data and make predictions or decisions without being explicitly programmed.",
    "Deep learning, a subset of machine learning, has gained prominence in recent years, utilizing neural networks with multiple layers to process complex data and extract meaningful patterns.",
    "One of the key advancements in modern machine learning is the availability of large amounts of data, which enables more accurate and robust models.",
    "The use of GPUs (Graphics Processing Units) has revolutionized machine learning by significantly accelerating the training and inference processes, especially for deep learning algorithms.",
    "Transfer learning has emerged as a powerful technique, allowing models to leverage knowledge gained from pre-training on large datasets and apply it to specific tasks with limited labeled data.",
    "Reinforcement learning, an area of machine learning, focuses on training agents to make sequential decisions by interacting with an environment and receiving feedback in the form of rewards.",
    "Explainable AI (Artificial Intelligence) is becoming increasingly important in modern machine learning, as it aims to provide understandable and interpretable explanations for the decisions made by AI systems.",
    "AutoML (Automated Machine Learning) has gained popularity, aiming to automate the process of selecting and optimizing machine learning models, making it more accessible to non-experts.",
    "Federated learning has emerged as a privacy-preserving approach in machine learning, enabling multiple devices or parties to collaboratively train models without sharing their raw data.",
    "Modern machine learning techniques have found applications in various domains, including image and speech recognition, natural language processing, autonomous vehicles, healthcare, finance, and many others.",
    "Machine learning algorithms can be broadly categorized into supervised learning, unsupervised learning, and reinforcement learning, each with its unique characteristics and applications.",
    "Supervised learning involves training a model on labeled data, where the desired output is known, allowing the model to learn patterns and make predictions on new, unseen data.",
    "Unsupervised learning focuses on discovering patterns and structures in unlabeled data, without specific target outputs, and is often used for tasks like clustering and dimensionality reduction.",
    "Reinforcement learning relies on an agent interacting with an environment, learning through trial and error to maximize cumulative rewards, and has shown great potential in areas like robotics and game playing.",
    "One of the challenges in modern machine learning is overfitting, where a model performs well on training data but fails to generalize to new, unseen data due to excessive complexity or noise in the training set.",
    "Regularization techniques, such as L1 and L2 regularization, are commonly used to mitigate overfitting by adding penalty terms to the model's loss function, discouraging overly complex solutions.",
    "Feature engineering plays a crucial role in machine learning, involving the selection, extraction, and transformation of relevant features from raw data to enhance a model's predictive performance.",
    "With the advent of deep learning, feature learning has become more automated, allowing neural networks to automatically learn hierarchical representations from raw data, reducing the need for manual feature engineering.",
    "Convolutional Neural Networks (CNNs) have revolutionized image recognition tasks, utilizing convolutional layers to capture local patterns and hierarchical structures in images, achieving state-of-the-art performance.",
    "Recurrent Neural Networks (RNNs) are widely used in natural language processing tasks, capable of capturing sequential dependencies and long-term contextual information, making them effective for tasks like language translation and sentiment analysis.",
    "Generative Adversarial Networks (GANs) have gained attention for their ability to generate realistic synthetic data by training a generator network to compete against a discriminator network, resulting in a creative and data-driven approach.",
    "Machine learning models are typically evaluated using metrics such as accuracy, precision, recall, F1 score, and area under the curve (AUC), providing quantitative measures of their performance on specific tasks.",
    "Cross-validation is a commonly used technique to assess a model's generalization ability by splitting the data into multiple subsets for training and evaluation, helping to estimate its performance on unseen data.",
    "The bias-variance tradeoff is a fundamental concept in machine learning, balancing the model's ability to fit the training data well (low bias) while avoiding overfitting (low variance) to improve generalization.",
    "Ensemble learning methods, such as random forests and boosting, combine multiple base models to improve predictive performance and reduce the risk of overfitting, resulting in more robust and accurate models.",
    "Hyperparameter tuning involves optimizing the settings or configurations of a machine learning model, such as learning rate, regularization strength, and network architecture, to achieve better performance.",
    "Grid search and random search are common approaches for hyperparameter tuning, systematically exploring the hyperparameter space to find the optimal combination that maximizes the model's performance.",
    "Model selection is a critical step in machine learning, where different algorithms or models are compared and evaluated to identify the most suitable one for a specific task based on their performance and complexity.",
    "The No Free Lunch (NFL) theorem states that no machine learning algorithm is universally superior to all others across all possible problems, emphasizing the importance of selecting the right algorithm for a given task.",
    "The curse of dimensionality refers to the challenges that arise when working with high-dimensional data, as the data becomes sparser, and the risk of overfitting increases, requiring careful feature selection and dimensionality reduction techniques.",
    "Deep learning models often require large amounts of labeled data for training, which can be a bottleneck in domains where labeled data is scarce or expensive to acquire, leading to the exploration of semi-supervised and unsupervised learning methods.",
    "The availability of open-source machine learning frameworks and libraries, such as TensorFlow, PyTorch, and scikit-learn, has significantly contributed to the widespread adoption and accessibility of modern machine learning techniques.",
    "The field of Explainable AI (XAI) aims to address the black-box nature of some machine learning models by providing interpretable explanations and insights into how the models make their predictions or decisions.",
    "Interpretability techniques, such as feature importance analysis, saliency maps, and attention mechanisms, help users understand the underlying reasoning and factors that influence the model's outputs.",
    "Ethical considerations in machine learning have gained attention, focusing on issues like bias and fairness, privacy, accountability, and transparency to ensure responsible and trustworthy deployment of AI systems.",
    "Bias in machine learning can arise from biased training data, leading to discriminatory outcomes and reinforcing existing societal biases, highlighting the need for diverse and representative training datasets.",
    "Fairness-aware machine learning algorithms aim to mitigate bias and ensure fair treatment across different demographic groups, promoting fairness and equality in decision-making processes.",
    "Privacy concerns arise when dealing with sensitive data in machine learning, leading to the development of privacy-preserving techniques, such as differential privacy and federated learning, to protect individuals' data while still enabling model training.",
    "Adversarial attacks pose a threat to machine learning models, where malicious actors manipulate input data to deceive the model and cause incorrect predictions, driving the development of adversarial defense mechanisms.",
    "The field of reinforcement learning has seen remarkable advancements, with algorithms like Deep Q-Networks (DQN) and Proximal Policy Optimization (PPO) achieving human-level performance in challenging tasks like playing complex games.",
    "Robotics has benefited from machine learning techniques, enabling robots to learn from data and adapt their behaviors to interact with and navigate the physical world, leading to advancements in autonomous vehicles and robotic automation.",
    "Machine learning has found applications in healthcare, including disease diagnosis, personalized treatment recommendation, drug discovery, and medical imaging analysis, aiding clinicians and improving patient outcomes.",
    "In finance, machine learning models are used for tasks like fraud detection, credit scoring, algorithmic trading, and risk assessment, leveraging vast amounts of financial data to make accurate predictions and inform decision-making.",
    "Natural language processing (NLP) has made significant strides, with models like BERT and GPT achieving state-of-the-art performance in tasks such as sentiment analysis, language translation, and question answering.",
    "Machine learning is playing a crucial role in environmental sciences, helping analyze climate data, predict natural disasters, monitor wildlife, and support sustainability efforts through applications like precision agriculture.",
    "The interpretability of machine learning models has become a regulatory requirement in some domains, such as healthcare, where explainable and transparent AI systems are necessary to ensure patient safety and regulatory compliance.",
    "The field of machine learning continues to evolve rapidly, with ongoing research in areas like meta-learning, few-shot learning, lifelong learning, and continual learning, aiming to improve the capabilities and flexibility of AI systems.",
    "As machine learning models become more complex and powerful, there is an increasing need for ethical guidelines, regulations, and frameworks to govern their development, deployment, and impact on society.",
    "The responsible use of machine learning requires interdisciplinary collaboration, involving not only computer scientists and engineers but also experts in fields like ethics, law, sociology, and psychology.",
    "Machine learning has the potential to transform industries and societies, driving innovation, improving efficiency, and addressing complex challenges, but it also requires careful consideration of its limitations and societal impact.",
    "As advancements in hardware, algorithms, and data availability continue, the future of machine learning holds great promise, with the potential to tackle increasingly complex problems and unlock new possibilities across various domains.",
    "Modern machine learning is an ever-evolving field, continuously pushing the boundaries of what machines can learn and achieve, and its impact on society will continue to grow in the years to come."
]

In [50]:
# Sanity check: number of documents in the demo corpus.
len(sentences)

52

In [60]:
class ScalableSemanticSearch:
    """Sentence-level semantic search on sentence-transformers embeddings and FAISS.

    Sentences are embedded with the ``all-mpnet-base-v2`` model. ``build_index``
    L2-normalizes the embeddings and stores them either in an exact
    ``IndexFlatL2`` (small corpora) or in a product-quantized ``IndexIVFPQ``
    (once the corpus reaches ``MIN_POINTS_FOR_IVFPQ`` vectors). Because both the
    indexed vectors and the query are unit length, the squared L2 distance ``d``
    reported by FAISS maps to cosine similarity as ``1 - d / 2``.

    Notes:
        * ``encode`` overwrites ``hashmap_index_sentence`` on every call, so use
          it for the corpus only, not for ad-hoc queries.
        * ``save_index`` / ``load_index`` persist the FAISS index only; the
          index -> sentence mapping is not saved and must be rebuilt separately.
    """

    # Corpora smaller than this are indexed exactly with IndexFlatL2.
    MIN_POINTS_FOR_IVFPQ = 500
    # Product-quantizer settings passed to IndexIVFPQ: number of sub-quantizers
    # and bits per sub-quantizer code.
    # NOTE(review): FAISS expects the embedding dimension to be a multiple of
    # PQ_SUBQUANTIZERS - confirm if the model is ever swapped.
    PQ_SUBQUANTIZERS = 8
    PQ_BITS = 4

    def __init__(self, device="cpu"):
        """Load the embedding model and configure file logging.

        Args:
            device: Device string handed to ``SentenceTransformer`` (e.g. "cpu").

        Side effects:
            Creates a ``log`` directory in the current working directory and
            calls ``logging.basicConfig`` to log to
            ``log/scalable_semantic_search.log``.
        """
        self.device = device
        self.model = SentenceTransformer(
            "sentence-transformers/all-mpnet-base-v2", device=self.device
        )
        self.dimension = self.model.get_sentence_embedding_dimension()
        self.quantizer = None
        self.index = None
        self.hashmap_index_sentence = None

        log_directory = "log"
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(log_directory, exist_ok=True)
        log_file_path = os.path.join(log_directory, "scalable_semantic_search.log")

        logging.basicConfig(
            filename=log_file_path,
            level=logging.INFO,
            format="%(asctime)s %(levelname)s: %(message)s",
        )
        logging.info("ScalableSemanticSearch initialized with device: %s", self.device)

    @staticmethod
    def calculate_clusters(n_data_points: int) -> int:
        """Return the number of IVF clusters for a corpus of the given size.

        Uses the floor of the square root of the corpus size, with a lower
        bound of 2.

        Args:
            n_data_points: Number of vectors that will be indexed.

        Returns:
            ``max(2, floor(sqrt(n_data_points)))``.
        """
        return max(2, int(np.sqrt(n_data_points)))

    @staticmethod
    def _normalize_rows(vectors: np.ndarray) -> np.ndarray:
        """Return a float32, C-contiguous copy of ``vectors`` with unit-length rows.

        All-zero rows are returned unchanged instead of producing NaNs.
        """
        matrix = np.array(vectors, dtype="float32", ndmin=2)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0
        return np.ascontiguousarray(matrix / norms)

    def encode(self, data: List[str]) -> np.ndarray:
        """Encode input data using sentence transformer model.

        Also stores the position -> sentence mapping for ``data`` in
        ``self.hashmap_index_sentence``, replacing any previous mapping.

        Args:
            data: List of input sentences.

        Returns:
            Numpy float32 array of encoded sentences, as produced by the model
            (not re-normalized here).
        """
        embeddings = self.model.encode(data)
        self.hashmap_index_sentence = self.index_to_sentence_map(data)
        return embeddings.astype("float32")

    def build_index(self, embeddings: np.ndarray) -> None:
        """Build the index for FAISS search.

        The embeddings are L2-normalized on a copy before being added, so the
        caller's array is left untouched.

        Args:
            embeddings: Numpy array of encoded sentences, one row per sentence.

        Raises:
            ValueError: If ``embeddings`` is empty.
        """
        if np.size(embeddings) == 0:
            raise ValueError("Cannot build an index from an empty embeddings array.")

        vectors = self._normalize_rows(embeddings)
        n_data_points = len(vectors)

        if n_data_points >= self.MIN_POINTS_FOR_IVFPQ:
            self.quantizer = faiss.IndexFlatL2(self.dimension)
            n_clusters = self.calculate_clusters(n_data_points)
            self.index = faiss.IndexIVFPQ(
                self.quantizer,
                self.dimension,
                n_clusters,
                self.PQ_SUBQUANTIZERS,
                self.PQ_BITS,
            )
            logging.info("IndexIVFPQ created with %d clusters", n_clusters)
            # IVFPQ has to learn its centroids/codebooks before vectors are added.
            self.index.train(vectors)
        else:
            self.index = faiss.IndexFlatL2(self.dimension)
            logging.info("IndexFlatL2 created")

        self.index.add(vectors)
        logging.info("Index built on device: %s", self.device)

    @staticmethod
    def index_to_sentence_map(data: List[str]) -> Dict[int, str]:
        """Create a mapping between index and sentence.

        Args:
            data: List of sentences.

        Returns:
            Dictionary mapping index to the corresponding sentence.
        """
        return dict(enumerate(data))

    @staticmethod
    def get_top_sentences(
        index_map: Dict[int, str], top_indices: np.ndarray
    ) -> List[str]:
        """Get the top sentences based on the indices.

        Args:
            index_map: Dictionary mapping index to the corresponding sentence.
            top_indices: Numpy array of top indices.

        Returns:
            List of top sentences, in the order given by ``top_indices``.

        Raises:
            KeyError: If an index is not present in ``index_map``.
        """
        return [index_map[i] for i in top_indices]

    def search(self, input_sentence: str, top: int) -> Tuple[np.ndarray, np.ndarray]:
        """Find the indexed sentences most similar to ``input_sentence``.

        Args:
            input_sentence: The input sentence to compute similarity against.
            top: The maximum number of results to return.

        Returns:
            A tuple ``(indices, scores)``. ``indices`` holds the positions of the
            matching embeddings in the indexed array, in the order FAISS returns
            them. ``scores`` holds the matching cosine similarities, derived from
            the squared L2 distance between unit vectors as ``1 - d / 2``.
            With ``IndexIVFPQ`` the distances are computed on compressed
            vectors, so the scores are approximate. Fewer than ``top`` entries
            are returned when FAISS finds fewer neighbours.

        Raises:
            AttributeError: If no index has been built or loaded yet.
        """
        index = getattr(self, "index", None)
        if index is None:
            raise AttributeError(
                "The index has not been built yet. Build the index using `build_index` method first."
            )

        query_vector = self._normalize_rows(
            self.model.encode([input_sentence], device=self.device)
        )
        distances, indices = index.search(query_vector, top)

        # FAISS pads missing neighbours with label -1; drop them so callers
        # never look up a non-existent sentence.
        found = indices[0] >= 0
        return indices[0][found], 1 - distances[0][found] / 2

    def save_index(self, file_path: str) -> None:
        """Save the FAISS index to disk.

        Args:
            file_path: The path where the index will be saved.

        Raises:
            AttributeError: If no index has been built or loaded yet.
        """
        # `index` is initialised to None, so test the value rather than the
        # attribute's existence.
        if getattr(self, "index", None) is None:
            raise AttributeError(
                "The index has not been built yet. Build the index using `build_index` method first."
            )
        faiss.write_index(self.index, file_path)

    def load_index(self, file_path: str) -> None:
        """Load a previously saved FAISS index from disk.

        Only the FAISS index is restored; ``hashmap_index_sentence`` is left
        as it was.

        Args:
            file_path: The path where the index is stored.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The specified file '{file_path}' does not exist.")
        self.index = faiss.read_index(file_path)

    @staticmethod
    def measure_time(func: Callable, *args, **kwargs) -> Tuple[float, Any]:
        """Call ``func(*args, **kwargs)`` and time it.

        Returns:
            Tuple ``(elapsed_seconds, result)`` where ``result`` is whatever
            ``func`` returned.
        """
        # perf_counter is monotonic, so the interval cannot go negative if the
        # system clock is adjusted mid-call.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        return elapsed_time, result

    @staticmethod
    def measure_memory_usage() -> float:
        """Return the current process's resident set size in MiB."""
        process = psutil.Process(os.getpid())
        ram = process.memory_info().rss
        return ram / (1024**2)

    def timed_train(self, data: List[str]) -> Tuple[float, float]:
        """Encode ``data``, build the index, and log time and memory.

        Args:
            data: List of sentences to embed and index.

        Returns:
            Tuple ``(elapsed_seconds, memory_usage_mib)``; memory is sampled
            once, after the index has been built.
        """

        def _encode_and_index() -> None:
            self.build_index(self.encode(data))

        elapsed_time, _ = self.measure_time(_encode_and_index)
        memory_usage = self.measure_memory_usage()
        logging.info(
            "Training time: %.2f seconds on device: %s", elapsed_time, self.device
        )
        logging.info("Training memory usage: %.2f MB", memory_usage)
        return elapsed_time, memory_usage

    def timed_infer(self, query: str, top: int) -> Tuple[float, float]:
        """Run one ``search`` call and log time and memory.

        Args:
            query: Query sentence.
            top: Number of results requested from ``search``.

        Returns:
            Tuple ``(elapsed_seconds, memory_usage_mib)``; the search results
            themselves are discarded.
        """
        elapsed_time, _ = self.measure_time(self.search, query, top)
        memory_usage = self.measure_memory_usage()
        logging.info(
            "Inference time: %.2f seconds on device: %s", elapsed_time, self.device
        )
        logging.info("Inference memory usage: %.2f MB", memory_usage)
        return elapsed_time, memory_usage

    def timed_load_index(self, file_path: str) -> float:
        """Load an index from ``file_path`` and log how long it took.

        Args:
            file_path: The path where the index is stored.

        Returns:
            Elapsed time in seconds.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        elapsed_time, _ = self.measure_time(self.load_index, file_path)
        logging.info(
            "Index loading time: %.2f seconds on device: %s", elapsed_time, self.device
        )
        return elapsed_time

In [53]:
# Instantiate the search helper on CPU. The constructor loads the
# all-mpnet-base-v2 sentence-transformers model and sets up file logging
# under ./log.
semantic_search = ScalableSemanticSearch(device="cpu")

Exception when trying to download https://sbert.net/models/sentence-transformers/all-mpnet-base-v2.zip. Response 404


In [54]:
# Embed the corpus. encode() also records the position -> sentence mapping on
# the instance (hashmap_index_sentence) for later lookup.
embeddings = semantic_search.encode(sentences)

In [55]:
# Expect (number of sentences, embedding dimension).
embeddings.shape

(52, 768)

In [None]:
# Preview only the first few rows; dumping the full matrix floods the output.
embeddings[:3]

In [56]:
# 52 vectors is below build_index's 500-point threshold, so this builds an
# exact IndexFlatL2 rather than an IndexIVFPQ.
semantic_search.build_index(embeddings)

In [57]:
# The query is a truncated copy of corpus sentence 6, so that sentence is the
# expected top hit. search() returns (indices, scores), in that order.
query = "Explainable AI (Artificial Intelligence) is becoming increasingly important in modern machi"
top = 3
top_indices, top_scores = semantic_search.search(query, top)

In [58]:
# Corpus positions (indices into `sentences`) of the matches returned by search().
top_indices

array([ 6, 32, 45])

In [59]:
# Translate the returned indices back to text using the mapping captured by encode().
top_sentences = ScalableSemanticSearch.get_top_sentences(semantic_search.hashmap_index_sentence, top_indices)

In [17]:
# NOTE(review): the saved output of this cell comes from an earlier execution
# (In [17]) and lists corpus sentences 6, 0 and 1, which does not match the
# indices [6, 32, 45] shown for top_indices - re-run the notebook top to bottom
# to refresh it.
top_sentences

['Explainable AI (Artificial Intelligence) is becoming increasingly important in modern machine learning, as it aims to provide understandable and interpretable explanations for the decisions made by AI systems.',
 'Modern machine learning involves the use of sophisticated algorithms and computational models to enable computers to learn from data and make predictions or decisions without being explicitly programmed.',
 'Deep learning, a subset of machine learning, has gained prominence in recent years, utilizing neural networks with multiple layers to process complex data and extract meaningful patterns.']