In [14]:
import os
import json

from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv(); override=True lets
# them replace values that are already set in the process environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path, override=True)

True

In [2]:
from azure.cosmos import CosmosClient, exceptions, PartitionKey

# Database and container that hold the menu documents.
DATABASE_NAME = 'coffeeDatabase'
CONTAINER_NAME = 'menuContainer'

# Endpoint and key come from the environment; a missing variable raises KeyError here.
URL = os.environ['AZURE_COSMOSDB_ENDPOINT']
KEY = os.environ['AZURE_COSMOSDB_KEY']

client = CosmosClient(URL, credential=KEY)
database = client.get_database_client(DATABASE_NAME)
container = database.get_container_client(CONTAINER_NAME)

# Fetch every document in the container, querying across partitions.
query = "SELECT * FROM c"
items = [
    doc
    for doc in container.query_items(query=query, enable_cross_partition_query=True)
]

In [3]:
# Report how many documents were fetched, then display the first one as a sample.
print(f"{len(items)} \n")

items[0]

79 



{'id': 'c001',
 'item': 'Espresso',
 'price': 2.5,
 'description': 'Rich, full-bodied coffee, with a compact crema and strong aroma.',
 'category': 'Coffees',
 '_rid': 'uBp0AIaQaQIBAAAAAAAAAA==',
 '_self': 'dbs/uBp0AA==/colls/uBp0AIaQaQI=/docs/uBp0AIaQaQIBAAAAAAAAAA==/',
 '_etag': '"5e004bb8-0000-0500-0000-663ea6180000"',
 '_attachments': 'attachments/',
 '_ts': 1715381784}

In [4]:
# Drop the underscore-prefixed bookkeeping fields so only the menu data remains.
keys_to_remove = ['_rid', '_self', '_etag', '_attachments', '_ts']
for item in items:
    for key in keys_to_remove:
        # pop() with a default is a no-op when the key is already absent,
        # so this cell can safely be re-run.
        item.pop(key, None)
items[0]

{'id': 'c001',
 'item': 'Espresso',
 'price': 2.5,
 'description': 'Rich, full-bodied coffee, with a compact crema and strong aroma.',
 'category': 'Coffees'}

In [5]:
# The same provenance tag is attached to every item, so serialise it once.
source_metadata = json.dumps({'source': 'Contoso menu from Cosmos DB'})

for item in items:
    name = item['item']
    # Text block that is embedded and indexed later. The whitespace is significant:
    # a leading " \n", 8- and 12-space indents, and 4 trailing spaces reproduce the
    # exact string the triple-quoted template generated.
    item['content'] = (
        " \n"
        f"        Item Name: {name}\n"
        f"        {name} details:\n"
        f"            Price: {item['price']}$\n"
        f"            Category: {item['category']}\n"
        f"            Description: {item['description']}\n"
        "    "
    )
    item["metadata"] = source_metadata

print(items[0]['content'])

 
        Item Name: Espresso
        Espresso details:
            Price: 2.5$
            Category: Coffees
            Description: Rich, full-bodied coffee, with a compact crema and strong aroma.
    


In [9]:
from langchain.schema.document import Document
from openai import AzureOpenAI


# Azure OpenAI client used for the embedding requests.
# NOTE: this rebinds `client`, which previously referred to the CosmosClient;
# re-running the Cosmos DB cell after this one would break generate_embeddings.
client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version="2024-02-01",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

def generate_embeddings(text, model="text-embedding-3-large"):
    """Return the embedding vector for a single piece of text.

    Used both for document content and for query strings.

    Args:
        text: The string to embed.
        model: Value passed as ``model`` to the embeddings endpoint. Defaults to
            "text-embedding-3-large", the value that used to be hard-coded.

    Returns:
        The ``embedding`` of the first (and only) entry in the response data.
    """
    # Relies on the module-level `client` (the AzureOpenAI instance).
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

def print_embedding_cost(texts, model='text-embedding-3-large', price_per_1k_tokens=0.00013):
    """Print the token count and estimated embedding cost for a collection of texts.

    Args:
        texts: Iterable of plain strings and/or objects exposing a ``page_content``
            string attribute (such as a langchain ``Document``). May be empty.
        model: Model name used to select the tiktoken encoding.
        price_per_1k_tokens: Price in USD charged per 1,000 tokens.

    Raises:
        TypeError: If an element is neither a string nor has ``page_content``.
    """
    # Normalise every element to a plain string first, so unsupported input fails
    # with a clear error (instead of an unbound `total_tokens`) before any
    # tokenizer is loaded.
    pages = []
    for page in texts:
        if isinstance(page, str):
            pages.append(page)
        elif hasattr(page, 'page_content'):
            pages.append(page.page_content)
        else:
            raise TypeError(
                f'Unsupported element type for embedding cost: {type(page).__name__}'
            )

    if pages:
        # Imported lazily: the tokenizer is only needed when there is text to count.
        import tiktoken
        enc = tiktoken.encoding_for_model(model)
        total_tokens = sum(len(enc.encode(page)) for page in pages)
    else:
        # Empty input costs nothing; previously this raised IndexError on texts[0].
        total_tokens = 0

    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')
    
# Estimate the cost of embedding the content text of every menu item.
texts = [entry['content'] for entry in items]
print_embedding_cost(texts)

Total Tokens: 3591
Embedding Cost in USD: 0.000467


In [12]:
# Sanity check: embed a short string and look at the length of the returned vector.
sample_text = "this is a sample text"
embeddings = generate_embeddings(sample_text)
len(embeddings)

3072

In [13]:
# Embed each item's content text and store the vector on the item itself.
# This issues one embeddings request per item.
for item in items:
    vector = generate_embeddings(item["content"])
    item["content_vector"] = vector

In [15]:
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential

from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex
)

# The following variables from your .env file are used in this notebook
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]

# Use key-based auth when an admin key is configured; otherwise fall back to
# DefaultAzureCredential. Reading with .get(..., "") keeps that fallback reachable
# when the variable is unset (indexing os.environ directly raises KeyError first).
search_admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "")
credential = AzureKeyCredential(search_admin_key) if search_admin_key else DefaultAzureCredential()
index_name = "latest-coffee-index"

# Client used to create and update search indexes
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)

In [18]:

# Index schema. `id` is the document key; `content_vector` stores the embedding and
# is tied to the "myHnswProfile" vector search profile. 3072 is the embedding length
# observed for generate_embeddings earlier in this notebook.
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchableField(name="item", type=SearchFieldDataType.String),
    SimpleField(name="price", type=SearchFieldDataType.Double),
    SearchableField(name="category", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="description", type=SearchFieldDataType.String),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=3072,
        vector_search_profile_name="myHnswProfile",
    ),
    SearchableField(name="metadata", type=SearchFieldDataType.String, filterable=True),
]

# Vector search setup: one HNSW algorithm configuration, plus the profile
# ("myHnswProfile") that the content_vector field refers to by name.
hnsw_algorithm = HnswAlgorithmConfiguration(name="myHnsw")
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
)
vector_search = VectorSearch(
    algorithms=[hnsw_algorithm],
    profiles=[hnsw_profile],
)

# Semantic configuration: which fields act as title, keywords and content.
prioritized_fields = SemanticPrioritizedFields(
    title_field=SemanticField(field_name="item"),
    keywords_fields=[SemanticField(field_name="category")],
    content_fields=[SemanticField(field_name="content")],
)
semantic_config = SemanticConfiguration(
    name="contoso-semantic-config",
    prioritized_fields=prioritized_fields,
)

# Wrap the configuration in the semantic settings object attached to the index
semantic_search = SemanticSearch(configurations=[semantic_config])
 
# Assemble the index definition from the schema and the vector/semantic settings.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)

# create_or_update_index makes this cell safe to re-run against an existing index.
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 latest-coffee-index created


In [19]:
from azure.search.documents import SearchClient

search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)

# upload_documents returns one indexing result per document; inspect them instead
# of reporting success for every item unconditionally.
result = search_client.upload_documents(items)
failed_uploads = [entry for entry in result if not entry.succeeded]
print(f"Uploaded {len(result) - len(failed_uploads)} documents")
for failure in failed_uploads:
    print(f"Failed to upload document {failure.key}: {failure.error_message}")

Uploaded 79 documents


In [23]:
# Pure Vector Search

from azure.search.documents.models import VectorizedQuery

# query = "What are your famous bakery items?" #query_vector
query = "Can I place an order for 1 Mocha, 1 lemon cake, 1 blueberry smoothie, and 1 Chicken sandwich?"

# Embed the query and ask for its 10 nearest neighbours on the content_vector field.
vector_query = VectorizedQuery(
    vector=generate_embeddings(query),
    k_nearest_neighbors=10,
    fields="content_vector",
)

# search_text=None: no keyword component, only the vector query is used.
results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["category", "content", "metadata"],
    top=10,
)

# Print the content text of each hit (result['@search.score'] is available for debugging).
for result in results:
    print(result['content'])

 
        Item Name: Mocha
        Mocha details:
            Price: 4.5$
            Category: Coffees
            Description: Delicious combination of coffee, milk and chocolate.
    
 
        Item Name: Blueberry Smoothie
        Blueberry Smoothie details:
            Price: 5.5$
            Category: Smoothies
            Description: Delicious smoothie made with fresh blueberries.
    
 
        Item Name: Lemon Cake
        Lemon Cake details:
            Price: 4.0$
            Category: Bakery
            Description: Sweet and tangy lemon cake with a moist crumb.
    
 
        Item Name: Chocolate Smoothie
        Chocolate Smoothie details:
            Price: 6.0$
            Category: Smoothies
            Description: Rich and creamy smoothie made with chocolate.
    
 
        Item Name: Blueberry Muffin
        Blueberry Muffin details:
            Price: 2.5$
            Category: Bakery
            Description: Soft and moist muffin packed with fresh blueberries and

In [26]:
# Pure Vector Search - Perform an Exhaustive KNN exact nearest neighbor search

from azure.search.documents.models import VectorizedQuery

# query = "What are your famous bakery items?" #query_vector
query = "Can I place an order for 1 Mocha, 1 lemon cake, 1 blueberry smoothie, and 1 Chicken sandwich?"

# Same vector query as the plain vector search, but exhaustive=True requests an
# exhaustive (exact) nearest-neighbour search.
query_embedding = generate_embeddings(query)
vector_query = VectorizedQuery(
    vector=query_embedding,
    k_nearest_neighbors=10,
    fields="content_vector",
    exhaustive=True,
)

# search_text=None: no keyword component, only the vector query is used.
results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["category", "content", "metadata"],
    top=10,
)

# Print the content text of each hit (result['@search.score'] is available for debugging).
for result in results:
    print(result['content'])

 
        Item Name: Mocha
        Mocha details:
            Price: 4.5$
            Category: Coffees
            Description: Delicious combination of coffee, milk and chocolate.
    
 
        Item Name: Blueberry Smoothie
        Blueberry Smoothie details:
            Price: 5.5$
            Category: Smoothies
            Description: Delicious smoothie made with fresh blueberries.
    
 
        Item Name: Lemon Cake
        Lemon Cake details:
            Price: 4.0$
            Category: Bakery
            Description: Sweet and tangy lemon cake with a moist crumb.
    
 
        Item Name: Chocolate Smoothie
        Chocolate Smoothie details:
            Price: 6.0$
            Category: Smoothies
            Description: Rich and creamy smoothie made with chocolate.
    
 
        Item Name: Blueberry Muffin
        Blueberry Muffin details:
            Price: 2.5$
            Category: Bakery
            Description: Soft and moist muffin packed with fresh blueberries and

In [27]:
# Hybrid Search

from azure.search.documents.models import VectorizedQuery

# query = "What are your famous bakery items?" #query_vector
query = "Can I place an order for 1 Mocha, 1 lemon cake, 1 blueberry smoothie, and 1 Chicken sandwich?"

# Vector half of the hybrid query: 10 nearest neighbours on content_vector.
query_embedding = generate_embeddings(query)
vector_query = VectorizedQuery(
    vector=query_embedding,
    k_nearest_neighbors=10,
    fields="content_vector",
)

# Hybrid: the raw query text is sent as search_text alongside the vector query.
results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["category", "content", "metadata"],
    top=10,
)

# Print the content text of each hit (result['@search.score'] is available for debugging).
for result in results:
    print(result['content'])

 
        Item Name: Lemon Cake
        Lemon Cake details:
            Price: 4.0$
            Category: Bakery
            Description: Sweet and tangy lemon cake with a moist crumb.
    
 
        Item Name: Chicken Sandwich
        Chicken Sandwich details:
            Price: 5.5$
            Category: Sandwiches
            Description: Delicious sandwich filled with chicken and veggies.
    
 
        Item Name: Strawberry Banana Smoothie
        Strawberry Banana Smoothie details:
            Price: 5.0$
            Category: Smoothies
            Description: Refreshing smoothie made with strawberries and bananas.
    
 
        Item Name: Mocha
        Mocha details:
            Price: 4.5$
            Category: Coffees
            Description: Delicious combination of coffee, milk and chocolate.
    
 
        Item Name: Chocolate Smoothie
        Chocolate Smoothie details:
            Price: 6.0$
            Category: Smoothies
            Description: Rich and creamy smoot

In [25]:
# Semantic Hybrid Search

from azure.search.documents.models import VectorizedQuery,  QueryType, QueryCaptionType, QueryAnswerType

# query = "What are your famous bakery items?" #query_vector
query = "Can I place an order for 1 Mocha, 1 lemon cake, 1 blueberry smoothie, and 1 Chicken sandwich?"

# Vector half of the query: 10 nearest neighbours on content_vector.
query_embedding = generate_embeddings(query)
vector_query = VectorizedQuery(
    vector=query_embedding,
    k_nearest_neighbors=10,
    fields="content_vector",
)

# Hybrid text + vector query with the semantic query type, using the
# "contoso-semantic-config" configuration and requesting extractive captions/answers.
results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["item", "category", "content", "metadata"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name="contoso-semantic-config",
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=10,
)

# Print each hit's content followed by a blank line
# (result['@search.score'] and result['@search.reranker_score'] are available for debugging).
for result in results:
    print(result['content'], end="\n\n")

 
        Item Name: Lemon Cake
        Lemon Cake details:
            Price: 4.0$
            Category: Bakery
            Description: Sweet and tangy lemon cake with a moist crumb.
    

 
        Item Name: Mocha
        Mocha details:
            Price: 4.5$
            Category: Coffees
            Description: Delicious combination of coffee, milk and chocolate.
    

 
        Item Name: Bagel
        Bagel details:
            Price: 1.5$
            Category: Bakery
            Description: Delicious, dense and chewy bread product, perfect for a quick breakfast.
    

 
        Item Name: Lemon Tea
        Lemon Tea details:
            Price: 2.5$
            Category: Tea or Chai
            Description: A tea with a lemon flavor.
    

 
        Item Name: Carrot Cake
        Carrot Cake details:
            Price: 4.0$
            Category: Bakery
            Description: Moist, sweet cake with a rich carrot and cinnamon flavor.
    

 
        Item Name: Blueberry Muffi