In [1]:
import os
import json

from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its entries into the process environment.
# override=True lets values from the .env file replace variables that are
# already set. The return value is what the notebook shows as this cell's output.
load_dotenv(find_dotenv(), override=True)

True

In [5]:
from azure.cosmos import CosmosClient, exceptions, PartitionKey

# Cosmos DB connection settings: endpoint and key come from the environment,
# database and container names are fixed for this notebook.
URL = os.environ['AZURE_COSMOSDB_ENDPOINT']
KEY = os.environ['AZURE_COSMOSDB_KEY']
DATABASE_NAME = 'coffeeDatabase'
CONTAINER_NAME = 'menuContainer'

# Client -> database -> container handles.
client = CosmosClient(URL, credential=KEY)
database = client.get_database_client(DATABASE_NAME)
container = database.get_container_client(CONTAINER_NAME)

# Read every document in the container and materialise the result
# into a plain list so later cells can index and mutate it.
query = "SELECT * FROM c"
items = [
    document
    for document in container.query_items(query=query, enable_cross_partition_query=True)
]

In [6]:
items[0]

{'id': 'c001',
 'item': 'Espresso',
 'price': 2.5,
 'description': 'Rich, full-bodied coffee, with a compact crema and strong aroma.',
 'category': 'Coffees',
 '_rid': 'uBp0AIaQaQIBAAAAAAAAAA==',
 '_self': 'dbs/uBp0AA==/colls/uBp0AIaQaQI=/docs/uBp0AIaQaQIBAAAAAAAAAA==/',
 '_etag': '"00006e4a-0000-0500-0000-656976800000"',
 '_attachments': 'attachments/',
 '_ts': 1701410432}

In [9]:
# Drop the underscore-prefixed bookkeeping fields that come back with each
# document; only the menu fields are kept. Missing keys are ignored.
keys_to_remove = ['_rid', '_self', '_etag', '_attachments', '_ts']
for item in items:
    for key in keys_to_remove:
        item.pop(key, None)
items[0]

{'id': 'c001',
 'item': 'Espresso',
 'price': 2.5,
 'description': 'Rich, full-bodied coffee, with a compact crema and strong aroma.',
 'category': 'Coffees'}

In [14]:
# Add two derived fields to every menu item (the dicts are modified in place):
#   content  - a text summary built from item, price, category and description
#   metadata - a JSON string carrying a fixed source label
for item in items:
    # The template's line breaks and indentation become part of the stored
    # string exactly as written here, including the space after the opening quotes.
    item['content'] = f""" 
        Item Name: {item['item']}
        {item['item']} details:
            Price: {item['price']}$
            Category: {item['category']}
            Description: {item['description']}
    """
    # Serialised to a string with json.dumps rather than stored as a dict.
    item["metadata"] = json.dumps({'source': 'Contoso menu from Cosmos DB'})

In [18]:
from langchain.schema.document import Document
import openai

# Embedding helper shared by document indexing and query-time vector search.
def generate_embeddings(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to ``openai.Embedding.create`` using the
    "text-embedding-ada-002" engine and returns the ``embedding`` entry of
    the first element in the response's ``data`` list.
    """
    first_result = openai.Embedding.create(
        input=text, engine="text-embedding-ada-002"
    )['data'][0]
    return first_result['embedding']

def print_embedding_cost(texts, price_per_1k_tokens=0.0004):
    """Print the token count and estimated embedding cost for ``texts``.

    Args:
        texts: Iterable whose elements are either plain strings or objects
            exposing a string ``page_content`` attribute (such as the
            ``Document`` class imported in this notebook). Elements of both
            kinds may be mixed, and the iterable may be empty.
        price_per_1k_tokens: Price in USD per 1,000 tokens used for the
            estimate. Defaults to 0.0004, the rate this function previously
            hard-coded.

    Raises:
        TypeError: If an element is neither a string nor an object with a
            string ``page_content`` attribute.

    Returns:
        None. The two summary lines are written to stdout.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model('text-embedding-ada-002')

    def _as_text(page):
        # Resolve each element individually instead of inferring the type of
        # the whole collection from its first element.
        if isinstance(page, str):
            return page
        content = getattr(page, 'page_content', None)
        if isinstance(content, str):
            return content
        raise TypeError(
            f'Cannot count tokens for {type(page).__name__!r}: expected str '
            f'or an object with a string page_content attribute'
        )

    # sum() over an empty iterable is 0, so empty input reports zero cost
    # instead of failing on texts[0].
    total_tokens = sum(len(enc.encode(_as_text(page))) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')
    
# Gather the generated text of every menu item, then report how many tokens
# embedding all of it will consume.
texts = list(map(lambda item_dict: item_dict['content'], items))
print_embedding_cost(texts)

Total Tokens: 3591
Embedding Cost in USD: 0.001436


In [19]:
# One embedding request per menu item; the resulting vector is stored on the
# item under "content_vector", next to the text it was computed from.
for item in items:
    item.update(content_vector=generate_embeddings(item["content"]))

In [20]:
import os
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,  
) 

from dotenv import load_dotenv, find_dotenv
# Same .env load as in the first cell, repeated here
# (presumably so this cell can be run on its own - confirm before removing).
load_dotenv(find_dotenv(), override=True)

# Index schema: one entry per key stored on each menu item.
fields = [
    # Document key.
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
        sortable=True,
        facetable=True,
    ),
    SearchableField(name="item", type=SearchFieldDataType.String),
    SimpleField(name="price", type=SearchFieldDataType.Double),
    SearchableField(name="category", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="description", type=SearchFieldDataType.String),
    # Embedding of "content"; the configuration name refers to the HNSW
    # algorithm configuration declared in the index's vector search settings.
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_configuration="hnsw-vector-config",
    ),
    SearchableField(name="metadata", type=SearchFieldDataType.String, filterable=True),
]

# Vector search settings: a single HNSW configuration using cosine distance.
# Its name is the one the "content_vector" field points at.
vector_search = VectorSearch(
    algorithm_configurations=[
        HnswVectorSearchAlgorithmConfiguration(
            name="hnsw-vector-config",
            kind="hnsw",
            parameters=dict(m=4, efConstruction=500, efSearch=600, metric="cosine"),
        ),
    ],
)

# Semantic configuration: "item" acts as the title, "category" supplies the
# keywords and "content" is the body text.
semantic_config = SemanticConfiguration(
    name="contoso-semantic-config",
    prioritized_fields=PrioritizedFields(
        prioritized_content_fields=[SemanticField(field_name="content")],
        prioritized_keywords_fields=[SemanticField(field_name="category")],
        title_field=SemanticField(field_name="item"),
    ),
)

# Wrap the single configuration in the settings object the index expects.
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Assemble the index definition from the schema, vector and semantic settings.
index_name = "contoso-coffee-index"

index = SearchIndex(
    name=index_name,
    semantic_settings=semantic_settings,
    vector_search=vector_search,
    fields=fields,
)


# Search service endpoint and API key are taken from the environment.
service_endpoint = os.environ['AZURE_COGNITIVE_SEARCH_ENDPOINT']
key = os.environ['AZURE_COGNITIVE_SEARCH_KEY']
credential = AzureKeyCredential(key)

# Create the index, or update it in place when it already exists.
# The call is left as the cell's last expression so the returned index is displayed.
index_client = SearchIndexClient(
    credential=credential,
    endpoint=service_endpoint,
)
index_client.create_or_update_index(index)

<azure.search.documents.indexes.models._index.SearchIndex at 0x284ed5febf0>

In [21]:
# Push every menu item (including its embedding) into the search index.
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(items)

# upload_documents returns one result per document, and an individual
# document can be rejected without the call raising. Report what was actually
# accepted instead of assuming all of `items` made it into the index.
failed_keys = [doc_result.key for doc_result in result if not doc_result.succeeded]
print(f"Uploaded {len(result) - len(failed_keys)} documents")
if failed_keys:
    print(f"Failed to upload {len(failed_keys)} documents: {failed_keys}")

Uploaded 79 documents


In [23]:
# Full Text Search: keyword matching only, no embeddings involved.
# Alternative query to try: "What are your famous bakery items?"
query = "Can I place an order for 1 Mocha, 1 lemon cake, 1 blueberry smoothie, and 1 Chicken sandwich?"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=credential,
)
results = search_client.search(
    search_text=query,
    top=10,
    select=["item", "price", "category", "content", "metadata"],
)

# Print the stored text of each hit; hit['@search.score'] can be printed
# as well when inspecting ranking.
for hit in results:
    print(hit['content'])

 
        Item Name: Chicken Sandwich
        Chicken Sandwich details:
            Price: 5.5$
            Category: Sandwiches
            Description: Delicious sandwich filled with chicken and veggies.
    
 
        Item Name: Lemon Cake
        Lemon Cake details:
            Price: 4.0$
            Category: Bakery
            Description: Sweet and tangy lemon cake with a moist crumb.
    
 
        Item Name: Club Sandwich
        Club Sandwich details:
            Price: 6.0$
            Category: Sandwiches
            Description: Triple-decker sandwich with chicken, bacon, lettuce, and tomato.
    
 
        Item Name: Chicken Caesar Wrap
        Chicken Caesar Wrap details:
            Price: 6.5$
            Category: Sandwiches
            Description: Tasty wrap filled with chicken, lettuce, and Caesar dressing.
    
 
        Item Name: Carrot Cake
        Carrot Cake details:
            Price: 4.0$
            Category: Bakery
            Description: Moist, sweet c

In [26]:
# Semantic Search: keyword retrieval with the semantic configuration defined on the index.
# Alternative query to try: "What are your famous bakery items?"
query = "Can I place an order for 1 Mocha, 1 lemon cake, 1 blueberry smoothie, and 1 Chicken sandwich?"

results = search_client.search(
    search_text=query,
    top=10,
    select=["item", "price", "category", "content", "metadata"],
    query_type="semantic",
    semantic_configuration_name="contoso-semantic-config",
    query_language="en-us",
    query_speller="lexicon",
    query_answer="extractive",
    query_caption="extractive",
)

# Print the stored text of each hit; hit['@search.score'] and
# hit['@search.reranker_score'] can be printed as well when inspecting ranking.
for hit in results:
    print(hit['content'])

 
        Item Name: Lemon Cake
        Lemon Cake details:
            Price: 4.0$
            Category: Bakery
            Description: Sweet and tangy lemon cake with a moist crumb.
    
 
        Item Name: Mocha
        Mocha details:
            Price: 4.5$
            Category: Coffees
            Description: Delicious combination of coffee, milk and chocolate.
    
 
        Item Name: Bagel
        Bagel details:
            Price: 1.5$
            Category: Bakery
            Description: Delicious, dense and chewy bread product, perfect for a quick breakfast.
    
 
        Item Name: Lemon Tea
        Lemon Tea details:
            Price: 2.5$
            Category: Tea or Chai
            Description: A tea with a lemon flavor.
    
 
        Item Name: Carrot Cake
        Carrot Cake details:
            Price: 4.0$
            Category: Bakery
            Description: Moist, sweet cake with a rich carrot and cinnamon flavor.
    
 
        Item Name: Blueberry Muffin
   

In [29]:
# Pure Vector Search: no search text, only the embedding of the query is
# matched against the "content_vector" field.
# Alternative query to try: "What are your famous bakery items?"
query = "Can I place an order for 1 Mocha, 1 lemon cake, 1 blueberry smoothie, and 1 Chicken sandwich?"

vector = Vector(
    value=generate_embeddings(query),
    fields="content_vector",
    k=10,
)

results = search_client.search(
    search_text=None,
    vectors=[vector],
    top=10,
    select=["category", "content", "metadata"],
)

# Print the stored text of each hit; hit['@search.score'] can be printed
# as well when inspecting ranking.
for hit in results:
    print(hit['content'])

 
        Item Name: Chicken Sandwich
        Chicken Sandwich details:
            Price: 5.5$
            Category: Sandwiches
            Description: Delicious sandwich filled with chicken and veggies.
    
 
        Item Name: Blueberry Smoothie
        Blueberry Smoothie details:
            Price: 5.5$
            Category: Smoothies
            Description: Delicious smoothie made with fresh blueberries.
    
 
        Item Name: Blueberry Muffin
        Blueberry Muffin details:
            Price: 2.5$
            Category: Bakery
            Description: Soft and moist muffin packed with fresh blueberries and a crumb topping.
    
 
        Item Name: Chocolate Smoothie
        Chocolate Smoothie details:
            Price: 6.0$
            Category: Smoothies
            Description: Rich and creamy smoothie made with chocolate.
    
 
        Item Name: Club Sandwich
        Club Sandwich details:
            Price: 6.0$
            Category: Sandwiches
            Descript

In [31]:
# Hybrid Search: the same query is sent both as search text and as an
# embedding, with the semantic configuration applied.
# Alternative query to try: "What are your famous bakery items?"
query = "Can I place an order for 1 Mocha, 1 lemon cake, 1 blueberry smoothie, and 1 Chicken sandwich?"

vector = Vector(
    value=generate_embeddings(query),
    fields="content_vector",
    k=15,
)

results = search_client.search(
    search_text=query,
    vectors=[vector],
    top=15,
    select=["item", "category", "content", "metadata"],
    query_type="semantic",
    semantic_configuration_name="contoso-semantic-config",
    query_language="en-us",
    query_speller="lexicon",
    query_answer="extractive",
    query_caption="extractive",
)

# Print each hit followed by a blank line; hit['@search.score'] and
# hit['@search.reranker_score'] can be printed as well when inspecting ranking.
for hit in results:
    print(hit['content'], end="\n\n")

 
        Item Name: Lemon Cake
        Lemon Cake details:
            Price: 4.0$
            Category: Bakery
            Description: Sweet and tangy lemon cake with a moist crumb.
    

 
        Item Name: Mocha
        Mocha details:
            Price: 4.5$
            Category: Coffees
            Description: Delicious combination of coffee, milk and chocolate.
    

 
        Item Name: Bagel
        Bagel details:
            Price: 1.5$
            Category: Bakery
            Description: Delicious, dense and chewy bread product, perfect for a quick breakfast.
    

 
        Item Name: Lemon Tea
        Lemon Tea details:
            Price: 2.5$
            Category: Tea or Chai
            Description: A tea with a lemon flavor.
    

 
        Item Name: Carrot Cake
        Carrot Cake details:
            Price: 4.0$
            Category: Bakery
            Description: Moist, sweet cake with a rich carrot and cinnamon flavor.
    

 
        Item Name: Blueberry Muffi