In [1]:
""" THIS PYTHON FILE CONTAINS THE CLASS FOR INDEXING EMBEDDINGS USING ANNOY"""

# importing the required libraries
import numpy as np
from annoy import AnnoyIndex
import os
from functools import reduce
import json



# building a class for storing and searching in the index
class AnnoyIndexer:

    """ A class to built and search an annoy index """

    def __init__(self):
        self.index=None


    
    def build_index(self):
       
        # Example configuration
        embedding_size = 384  # Example embedding size
        n_trees = 10  # Number of trees

        # Create a dummy embeddings dictionary
        # importing the embedding json file
        with open('../../artifacts/embeddings.json', 'r') as file:
            embeddings_dict = json.load(file)
                

        # Initialize Annoy index
        index = AnnoyIndex(embedding_size, 'euclidean')

        # Add items to the index
        for i, embedding in embeddings_dict.items():
            index.add_item(int(i), embedding)

        # Build the index
        try:
            index.build(n_trees)
            print("Successfully built the index")
        except Exception as e:
            print(f"Error building the index: {e}")

        # Save the index
        index.save('artifacts/TEST_annoy_index.ann')
        print("Index saved")

            
       

       


    def search(self, query_embedding:list, k:int, ids_lookup:dict=None)->list:
        """
        searches for the K nearest neighbors for the given query embedding

        query_embedding    : list of embedding of the queries we want to search
        k                  : no of nearest neightbor that we want to return
        ids_lookup         : a dictionary mapping where keys are article_ids and values are the embeddings
         
        returns a list of tuples of the form  [ (id,distance) , (id,distance) ]
        """

        # defining an empty list to store the results
        results = []

        for query in query_embedding:
            # getting the k nearest neightbor for the embedding
            result = self.index.get_nns_by_vector(query, k, include_distances=True)
            # result will be list of tuple of lists ( [a,b,c],[0.1,0.5,0.7] ) where a,b,c are indices(section_ids) and 0.1,0.5,0.7 are the similarity metrics
            # now we want to map the section_ids to article_ids
            ids =list( map( 
                           lambda x:ids_lookup[str(x)],
                            result[0]
                          ) )
            # sorting the result based on the metrics in descending order
            res = sorted( set(zip(ids,result[1])) ,lambda x:x[1], reverse=True)

            # appeding the nearest neighbors in the result list
            results.append(res)

        return results
    



    def save_index(self,index_path:str):
        """
        saves the index to the disk
        index_path: path where we want to store our index
        """
        self.index.save(index_path)
        logging.info(f"Index saved to {index_path}")

    


    def load(self,index_path:str):
        """
        loads the index from the disk
        index_path  : path of the index 
        """
        self.index = AnnoyIndex(config.EMBEDDING_SIZE, config.ANNOY_METRIC)
        self.index.load(index_path)
        logging.info(f"Index loaded from {index_path}")






# importing the embedding json file
with open('../artifacts/embeddings.json', 'r') as file:
        embeddings = json.load(file)

obj=AnnoyIndexer()
obj.build_index()
obj.save_index()

    





FileNotFoundError: [Errno 2] No such file or directory: '../../artifacts/embeddings.json'