In [60]:
import numpy as np

class VectorStore:
    def __init__(self):
        self.vector_data = {}
        self.vector_index = {}

    def add_vector(self, vector_id, vector):
        self.vector_data[vector_id] = vector
        self._update_index(vector_id, vector)

    def cosine_similarity(self, vector, existing_vector):
        dot_product = np.dot(vector, existing_vector)
        norm_vector = np.linalg.norm(vector)
        norm_existing_vector = np.linalg.norm(existing_vector)
        similarity = dot_product / (norm_vector * norm_existing_vector)
        return similarity

    def _update_index(self, vector_id, vector):
        for existing_id, existing_vector in self.vector_data.items():
            similarity = self.cosine_similarity(vector, existing_vector)
            if vector_id not in self.vector_index:
                self.vector_index[vector_id] = {}
            if existing_id not in self.vector_index:
                self.vector_index[existing_id] = {}
            self.vector_index[vector_id][existing_id] = similarity
            self.vector_index[existing_id][vector_id] = similarity

    def find_similar_vectors(self, query_vector, num_results = 5):
        results = []
        try:
            for vector_id, vector in self.vector_data.items():
                results.append((vector_id, self.cosine_similarity(vector, query_vector)))

            results.sort(key = lambda x: x[1], reverse = True)
            return results[:num_results]
            
        except KeyError:
            return None
            
    def get_vector(self, vector_id):
        return (self.vector_data.get(vector_id), self.vector_index.get(vector_id).items())
            

In [61]:
# Establish a VectorStore instance
vector_store = VectorStore()

# Define sentences
sentences = [  # Defining a list of example sentences
    "I eat mango",
    "mango is my favorite fruit",
    "mango, apple, oranges are fruits",
    "fruits are good for health",
]

# Tokenization and Vocabulary Creation
vocabulary = set()
for sentence in sentences:
    tokens = sentence.lower().split(" ")
    vocabulary.update(tokens)

# Assign unique indices to vocabulary words
word_to_index = {word: i for i, word in enumerate(vocabulary)}

# Vectorization
sentence_vectors = {}
for sentence in sentences:
    tokens = sentence.lower().split(" ")
    vector = np.zeros(len(vocabulary))
    for token in tokens:
        vector[word_to_index[token]] += 1
    sentence_vectors[sentence] = vector

# Store in VectorStore
for sentence, vector in sentence_vectors.items():  # Iterating over each sentence vector
    vector_store.add_vector(sentence, vector) 

In [62]:
# Similarity Search
query_sentence = "Mango is the best fruit"  
query_vector = np.zeros(len(vocabulary))  
query_tokens = query_sentence.lower().split()  
for token in query_tokens:  
    if token in word_to_index:  
        query_vector[word_to_index[token]] += 1  

similar_sentences = vector_store.find_similar_vectors(query_vector, num_results=2)  # Finding similar sentences

# Display similar sentences
print("Query Sentence:", query_sentence)  
print("Similar Sentences:")  
for sentence, similarity in similar_sentences:  
    print(f"{sentence}: Similarity = {similarity:.4f}") 

Query Sentence: Mango is the best fruit
Similar Sentences:
mango is my favorite fruit: Similarity = 0.7746
I eat mango: Similarity = 0.3333


In [63]:
vector_store.get_vector("I eat mango")

(array([1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]),
 dict_items([('I eat mango', 1.0000000000000002), ('mango is my favorite fruit', 0.2581988897471611), ('mango, apple, oranges are fruits', 0.0), ('fruits are good for health', 0.0)]))

Inspired by - https://medium.com/@vidiptvashist/building-a-vector-database-from-scratch-in-python-6bd683ba5171