# Build a RAG solution using Azure Cosmos DB Solution

## Install Dependencies

In [1]:
%pip install numpy
%pip install openai
%pip install python-dotenv
%pip install azure-core
%pip install azure-cosmos


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;4

## Load Azure configurations

You always need to run this!

In [5]:
from dotenv import load_dotenv
import os

load_dotenv() # take environment variables from .env.

azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.getenv("AZURE_OPENAI_KEY")
azure_openai_deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT")
azure_openai_embeddings_deployment = os.getenv("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT")
azure_openai_api_version = "2024-10-01-preview"
azure_openai_embedding_size = 1536

azure_cosmosdb_endpoint = os.getenv("AZURE_COSMOSDB_ENDPOINT")
azure_cosmosdb_key = os.getenv("AZURE_COSMOSDB_KEY")
azure_cosmosdb_database = "recipes-database"
azure_cosmosdb_container = "recipes-container"

## Setup Azure Cosmos DB

In [6]:
from azure.cosmos import CosmosClient
from azure.cosmos import PartitionKey, exceptions

# Setup the connection
cosmos_client = CosmosClient(url=azure_cosmosdb_endpoint, credential=azure_cosmosdb_key)

# Create database
db = cosmos_client.create_database_if_not_exists(id=azure_cosmosdb_database)

# Author the vector embedding policy
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path":"/contentVector",
            "dataType":"float32",
            "distanceFunction":"cosine",
            "dimensions":azure_openai_embedding_size
        }
    ]
}

full_text_policy = {
   "defaultLanguage": "en-US",
   "fullTextPaths": [
       {
           "path": "/name",
           "language": "en-US"
       },
       {
           "path": "/description",
           "language": "en-US"
       }
   ]
}

# Add vector indexes to indexing policy
indexing_policy = {
    "includedPaths": [
        {
            "path": "/*"
        }
    ],
    "excludedPaths": [
        {
            "path": "/\"_etag\"/?"
        },
        {
            "path": "/contentVector/*"
        }
    ],
    "fullTextIndexes": [
        {
            "path": "/name"
        },
        {
            "path": "/description"
        }
    ],
    "vectorIndexes": [
        {"path": "/contentVector",
         "type": "quantizedFlat"
        }
    ]
}

try:    
    container = db.create_container_if_not_exists(
                    id=azure_cosmosdb_container,
                    partition_key=PartitionKey(path='/id', kind='Hash'),
                    indexing_policy=indexing_policy,
                    vector_embedding_policy=vector_embedding_policy,
                    full_text_policy=full_text_policy)

    print('Container with id \'{0}\' created'.format(id))

except exceptions.CosmosResourceExistsError:
    print('A container with id \'{0}\' already exists'.format(id))

container = db.get_container_client(azure_cosmosdb_container)

Container with id '<built-in function id>' created


## Creating embeddings separately

We are computing the embeddings manually

In [7]:
from openai import AzureOpenAI
import json

# Azure OpenAI client
openai_client = AzureOpenAI(
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    azure_deployment=azure_openai_embeddings_deployment,
    api_key=azure_openai_key)

# Read the recipes.json
path = os.path.join('../Data/recipes/', 'recipes.json')
with open(path, 'r', encoding='utf-8') as file:
    recipes = json.load(file)

# Convert each recipe dictionary into a formatted string 
# And store these strings in a list
combined_strings = []
for recipe in recipes:
    combined_string = ""
    for key, value in recipe.items():
        if isinstance(value, list):
            combined_string += f"{key}:\n"
            for item in value:
                combined_string += f"  - {item}\n"
        else:
            combined_string += f"{key}: {value}\n"
    combined_strings.append(combined_string)

# Generate embeddings for each combined string
content_response = openai_client.embeddings.create(
    input=combined_strings, 
    model=azure_openai_embeddings_deployment, 
    dimensions=azure_openai_embedding_size
)

content_embeddings = [recipe.embedding for recipe in content_response.data]

# add contentVector field in recipes
for i, item in enumerate(recipes):
    item['contentVector'] = content_embeddings[i]

# Output embeddings to new json file
output_path = os.path.join('../Data/recipes/', 'recipesVectors.json')
output_directory = os.path.dirname(output_path)
if not os.path.exists(output_directory):
    os.makedirs(output_directory)
with open(output_path, "w") as f:
    json.dump(recipes, f)

## Upload data to the container

In [8]:
import json

with open('../Data/recipes/recipesVectors.json') as f:
   data = json.load(f)

container_client = db.get_container_client(azure_cosmosdb_container)

for item in data:
    print("writing item ", item['id'])
    container_client.upsert_item(item)

writing item  1
writing item  2
writing item  3
writing item  4
writing item  5
writing item  6
writing item  7
writing item  8
writing item  9
writing item  10
writing item  11
writing item  12
writing item  13


## Hybrid Search helper function

In [11]:
from openai import AzureOpenAI
from azure.cosmos import CosmosClient

# Simple function to assist with hybrid search
def hybrid_search(user_query, num_results):

    # Setup the connection
    cosmos_client = CosmosClient(url=azure_cosmosdb_endpoint, credential=azure_cosmosdb_key)
    database = cosmos_client.get_database_client(azure_cosmosdb_database)
    container = database.get_container_client(azure_cosmosdb_container)

    # Azure OpenAI client
    openai_client = AzureOpenAI(
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    azure_deployment=azure_openai_embeddings_deployment,
    api_key=azure_openai_key)

    # get embedding of user query
    response = openai_client.embeddings.create(input=user_query, 
                                               model=azure_openai_embeddings_deployment, 
                                               dimensions=azure_openai_embedding_size)
    embedding = response.data[0].embedding

    # format the query
    query ='''
                SELECT TOP {0} 
                    c.id, 
                    c.name,
                    c.description,
                    c.cuisine,
                    c.difficulty,
                    c.prepTime,
                    c.cookTime,
                    c.totalTime,
                    c.servings,
                    c.ingredients,
                    c.instructions, 
                    VectorDistance(c.contentVector,{1}) AS SimilarityScore 
                FROM c 
                ORDER BY RANK RRF 
                    (VectorDistance(c.contentVector, {1}), FullTextScore(c.description, ['{2}']))
            '''.format(num_results, embedding, user_query)
    
    results = container.query_items(
            query=query,
            enable_cross_partition_query=True)

    # Extract the necessary information from the results
    formatted_results = []
    for document in results:
        score = document.pop('SimilarityScore')
        formatted_result = {
            'SimilarityScore': score,
            'document': document
        }
        formatted_results.append(formatted_result)

    return formatted_results    
    

## Hybrid Query Search

In [15]:
query = "teas in recipe"
results = hybrid_search(query, 3)

for document in results:
        print(f"ID: {document['document']['id']}")
        print(f"Name: {document['document']['name']}")

ID: 4
Name: Chai Tea
ID: 5
Name: Irish Coffee
ID: 2
Name: Tiramisu


## Send query results to a language model to generate response

In [16]:
from openai import AzureOpenAI

# Azure OpenAI client
openai_client = AzureOpenAI(
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key)

# Provide instructions to the model
SYSTEM_PROMPT="""
You are an AI assistant that helps users learn from the information found in the source material.
Answer the query using only the sources provided below.
Use bullets if the answer has multiple points.
If the answer is longer than 3 sentences, provide a summary.
Answer ONLY with the facts listed in the list of sources below. Cite your source when you answer the question
If there isn't enough information below, say you don't know.
Do not generate answers that don't use the sources below.
Query: {query}
Sources:\n{sources}
"""

# User Query
query = "What of the recipes use sugar?"

results = hybrid_search(query, 5)

# Use a unique separator to make the sources distinct. 
# We chose repeated equal signs (=) followed by a newline because it's unlikely the source documents contain this sequence.
sources_formatted = "=================\n".join(
  [f'''Name: {document['document']['name']}, 
   Description: {document['document']['description']}, 
   Cuisine: {document['document']['cuisine']},
   Difficulty: {document['document']['difficulty']},
   Preparation Time: {document['document']['prepTime']},
   Cooking Time: {document['document']['cookTime']},
   Total Time: {document['document']['totalTime']},
   Servings: {document['document']['servings']},
   Ingredients: {document['document']['ingredients']},
   Instructions: {document['document']['instructions']}'''
   for document in results])

response = openai_client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": SYSTEM_PROMPT.format(query=query, sources=sources_formatted)
        }
    ],
    model=azure_openai_deployment
)

print(response.choices[0].message.content)


The recipes that use sugar are:

- **Irish Coffee**: Uses 1 tablespoon of brown sugar (Source: Irish Coffee).
- **Baklava**: Uses 1 cup of granulated sugar (Source: Baklava).
- **Cheesecake**: Uses 1 1/2 cups of granulated sugar (Source: Cheesecake).
- **Tiramisu**: Uses 3/4 cup of granulated sugar (Source: Tiramisu).
- **Margarita**: Uses 1 teaspoon of agave nectar or simple syrup (which contains sugar) (Source: Margarita). 

These recipes involve different types of sugar and sweeteners as part of their ingredients.
