# 2. Building the vector database
- we load our previously created collections from the database
- we use a pipeline to get the raw profile data as well as a unique identifier for each profile
- we split the profiles into chunks
- we embed the chunks
- we store the embeddings in a vector database
- we can now query our vector database for semantic similarity

### Prerequisites

In [1]:
import json
from pymongo import MongoClient

In [2]:
# Initialize the client
client = MongoClient()

# The database on the client we're connecting to
db = client['zhaw_matchmaking']

# The collections in the database we are using
profile_data_collection = db['profile_data']
persons_collection = db["persons"]

### Use a pipeline to retrieve the profile data and the unique `shorthandSymbol` for each user
- we access both collections
- we use a generator for memory efficiency

In [3]:
# Our aggregation pipeline
pipeline = [
    {
        "$lookup": {
            "from": "persons",
            "localField": "person_id",
            "foreignField": "_id",
            "as": "person_data"
        }
    },
    {
        "$unwind": "$person_data"
    },
    {
        "$project": {
            "raw_data": 1,
            "shorthandSymbol": "$person_data.shorthandSymbol"
        }
    }
]

#### Define a generator function

In [4]:
def generate_profiles():
    for profile in profile_data_collection.aggregate(pipeline):
        yield profile

## Preprocess the profile data

In [5]:
import re

In [6]:
def preprocess_profile(raw_data: str):
    # Remove all obsolete whitespace
    cleaned_str = re.sub(r'\s+', ' ', raw_data)

    # Remove the final "Zurück" in every profile
    cleaned_str = cleaned_str.replace('Zurück', '')

    # Insert whitespace between numbers and letters (when they got lost from the original html)
    pattern = r'([a-zA-Z])(\d)'
    cleaned_str = re.sub(pattern, r'\1 \2', cleaned_str)

    return cleaned_str

## Load the profiles as documents for LangChain

In [7]:
from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document

### Define a custom loader

In [8]:
class CustomLoader(BaseLoader):
    def __init__(self, generate_profiles):
        self.profiles = generate_profiles

    def lazy_load(self):
        for profile in self.profiles:
            metadata = {"source": profile["shorthandSymbol"]}
            page_content = preprocess_profile(profile["raw_data"])
            yield Document(page_content=page_content, metadata=metadata)

    def load(self):
        return list(self.lazy_load())

### Instantiate the custom loader and load the profiles as documents
- we use the custom loader to load our profile data
- each profile will be a `Document`

In [9]:
loader = CustomLoader(generate_profiles())
loaded_documents = loader.load()

## Chunking
- as our profile data might be too large as context for a prompt, we need to split each profile (loaded document) into smaller chunks
- no overlap necessary
- `RecursiveCharacterTextSplitter` is recommended for generic text
- each loaded document will be split into chunks, these chunks will be held in `list_of_document_chunks`

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [12]:
# Instantiate text splitter with chunk size of 500 characters
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500)

In [13]:
list_of_document_chunks = text_splitter.split_documents(loaded_documents)

In [14]:
print(len(list_of_document_chunks))

57834


## Create embeddings from the generated chunks and add to our Vector database
- **make sure to place your OpenAI API key in `.env`**

In [3]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

openai.api_key = os.environ['OPENAI_API_KEY']

In [4]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

embedding_function = OpenAIEmbeddings(show_progress_bar=True)

### Create a `ChromaDB` from our generated list of document chunks and save it to disk
- this will take a while, the final database is around `1GB`
- to avoid getting a `RateLimitError`, we use the `tenacity` library to retry the call after an exponential delay
- we also call a custom function to fill the database with splits of the generated chunks (Chroma can handle max. 41000 at once)

In [20]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

In [21]:
@retry(wait=wait_random_exponential(min=20, max=120), stop=stop_after_attempt(20))
def create_vectordb(split_chunks):
    vectordb = Chroma.from_documents(
        documents=split_chunks,
        embedding=embedding_function,
        persist_directory="./chroma_db")
    vectordb.persist()

In [22]:
def split_list_of_document_chunks(input_list, split_size):
    for i in range(0, len(input_list), split_size):
        yield input_list[i:i + split_size]

In [23]:
split_document_chunks = split_list_of_document_chunks(list_of_document_chunks, 41000)

In [None]:
for split_chunk in split_document_chunks:
    create_vectordb(split_chunk)

## Query our vector database
- we use similarity search here
- number of retrieved documents `k = 4` (default)
- continued in *part 3*

In [5]:
# Load database from disk
loaded_vectordb = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)

In [6]:
query = "Wer arbeitet an Blockchain-Technologie?"

In [7]:
results = loaded_vectordb.similarity_search(query, k=4)

  0%|          | 0/1 [00:00<?, ?it/s]

#### We retrieve semantically similar chunks, from different sources

In [None]:
# Content of a single retrieved chunk
print(results[0].page_content)

In [None]:
# Source of a single retrieved chunk
print(results[0].metadata)

In [9]:
# Number of different sources retrieved
sources = set()
for result in results:
    sources.add(result.metadata["source"])

print(len(sources))

2
