# Vector Utils

In [8]:
import numpy as np


def tokenize(text):
    """
    Lowercase the text and break it into words on whitespace.

    Nothing else is normalized: punctuation stays attached to its word,
    so "love?" and "love" are different tokens.
    """
    lowered = text.lower()  # lowercase first so matching ignores capitalization
    return lowered.split()  # split() with no argument cuts on any run of whitespace


def build_vocab(docs):
    """
    Build vocabulary from list of docs.

    Every doc is split on whitespace and each word is lowercased.
    Returns a list of the unique words, ordered by first appearance in
    the corpus, so the same docs always produce the same vocabulary
    (and therefore the same vector layout) on every run.
    """
    # dict keys are unique and keep insertion order, so this removes duplicates
    # without the run-to-run reordering that list(set(...)) can introduce.
    unique_words = dict.fromkeys(
        word.lower() for doc in docs for word in doc.split()
    )
    return list(unique_words)


def text_to_vector(text, vocab):
    """
    Convert text into a vector based on the vocab (Bag of Words).

    The text is tokenized with tokenize(); the result is a list with one
    integer per vocab word, in vocab order, giving how many times that
    word occurs in the text. Words of the text that are not in vocab are
    ignored, and vocab words missing from the text get a count of 0.
    """
    from collections import Counter  # local import keeps this function self-contained

    # Count every token once up front instead of rescanning the token list
    # for each vocabulary word.
    token_counts = Counter(tokenize(text))
    # A Counter returns 0 for keys it has not seen, which is exactly the
    # bag-of-words value for an absent word.
    return [token_counts[word] for word in vocab]


def cosine_similarity(v1, v2):
    """
    Compute cosine similarity between two vectors.

    Returns dot(v1, v2) / (||v1|| * ||v2||). If either vector has zero
    length (all zeros), the similarity is undefined; 0.0 is returned in
    that case instead of dividing by zero and producing NaN.
    """
    a = np.array(v1)  # convert the inputs into arrays
    b = np.array(v2)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)  # product of the Euclidean lengths
    if norm_product == 0:
        # A zero vector has no direction, so report "no similarity".
        return 0.0
    return np.dot(a, b) / norm_product

# Vector Store

In [32]:

class VectorStore:
    """
    In-memory bag-of-words index over a list of documents.

    On construction the vocabulary is built from the documents and each
    document is converted to a count vector over that vocabulary.
    """

    def __init__(self, documents):
        """
        Store the documents and precompute their vectors.
        """
        self.documents = documents
        self.vocab = build_vocab(documents)  # list of all unique words from our documents
        self.doc_vectors = [text_to_vector(doc, self.vocab) for doc in documents]  # one vector per document

    def search(self, query, k=3):
        """
        Search for top-k documents similar to the query.

        Returns a list of at most k (score, doc_number) tuples sorted by
        descending cosine similarity, where doc_number is the index of
        the document in self.documents. Documents with equal scores keep
        their original order.
        """
        import math  # local import: only needed for the NaN check below

        query_vector = text_to_vector(query, self.vocab)  # vectorize the input query with the same vocab
        similarity_score = []  # (score, doc_number) pairs
        for doc_number, each_doc_vector in enumerate(self.doc_vectors):
            score = float(cosine_similarity(query_vector, each_doc_vector))
            if math.isnan(score):
                # A NaN score (possible when one of the vectors is all zeros)
                # cannot be ordered, so it would make the ranking below
                # meaningless; treat it as "no similarity".
                score = 0.0
            similarity_score.append((score, doc_number))
        similarity_score.sort(key=lambda x: x[0], reverse=True)  # highest score first
        return similarity_score[:k]  # the top-k scores with their doc numbers



# Main

In [40]:
# docs below is our internal knowledge base; it could be any text of your choice.
docs = [
    "AI is the future",
    "I love programming in Python",
    "Artificial intelligence and machine learning are related fields",
    "Python is popular for data science",
    "AI is the most interesting pltform in the world"
]

# Build the vocabulary and one bag-of-words vector per document.
my_vector_db = VectorStore(docs)
# query is the user question (feel free to adjust it); k is the number of documents to retrieve.
# NOTE: tokens are split on whitespace only, so "love?" in this query is not the same token as "love" in the docs.
list_a = my_vector_db.search(query="Which language do I love?", k=2)
print(list_a)  # list of (similarity score, document index) pairs, best match first
best_results = list_a[0][1]  # document index of the highest-scoring result
print(f"The closest matching document is: {docs[best_results]}")  # only the single best match is shown

I love programming in Python


# Our simple RAG system.

###Tokenize:
####This takes a piece of text and breaks it into individual tokens. To make matching case-insensitive, all tokens are lowercased. For example, "AI is the future" becomes ['ai', 'is', 'the', 'future'].

###build_vocab:
####This function takes a corpus of documents (docs) and creates a list of the unique words in the entire corpus. Additional info about each line can be found in the comments.

###text_to_vector:
####Takes a string of text and converts it into a vector whose entries count how many times each vocabulary word occurs in the text.

###cosine_similarity:
####compute the cosine similarity of two vectors (user_question, knowledge_base_docs_vector)

###VectorStore
####This is the final class that will use all the tools defined above. Here I take a user_question and perform the following steps

###tokenize -> text_to_vector -> cosine similarity against our corpus -> print the top-k documents with the highest similarity score.
