# Using Vector Search in Weaviate and Azure OpenAI

In [20]:
# Import libraries
import os
from dotenv import load_dotenv
import json
import requests
import weaviate
from weaviate import EmbeddedOptions
from weaviate.classes.config import Configure, Property, DataType
from weaviate.classes.query import MetadataQuery, HybridVector, Move
import warnings

# Suppress every warning category so the cell outputs stay readable
warnings.filterwarnings("ignore")

# Load the environment variables
load_dotenv()

# Azure OpenAI settings read from the environment; indexing os.environ
# raises KeyError immediately if one of them is not set.
aoai_key = os.environ["AZURE_OPENAI_API_KEY"]
aoai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
aoai_deployment = os.environ["AZURE_OPENAI_DEPLOYMENT"]
aoai_embedding = os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT"]
aoai_embedding_3 = os.environ["AZURE_OPENAI_EMBEDDINGS_3_DEPLOYMENT"]

## Utility Functions

In [22]:
# JSON print beautifier
def json_print(data):
    """Print ``data`` to stdout as JSON indented by two spaces.

    ``data`` must be serializable by ``json.dumps``.
    """
    formatted = json.dumps(data, indent=2)
    print(formatted)

## Create or connect to a Weaviate Embedded DB

In [None]:
# Headers that pass the Azure OpenAI endpoint and key to Weaviate; the same
# set is used whether a new instance is started or an existing one is reused.
aoai_headers = {
    "X-OpenAI-BaseURL": aoai_endpoint,
    "X-Azure-Api-Key": aoai_key
}

# Probe the local HTTP port to find out whether an instance is already running.
# The timeout keeps the cell from hanging if the port accepts the connection
# but never answers.
try:
    response = requests.get("http://localhost:8079/v1/schema", timeout=5)
    r_err = False
except requests.exceptions.RequestException as err:
    print(err)
    r_err = True

if r_err:
    # Nothing answered on the port: start a new embedded instance.
    client = weaviate.connect_to_embedded(
        version="1.26.1",  # e.g. version="1.26.5"
        headers=aoai_headers,
    )
    print("Connected to new instance")
elif response.status_code == 200:
    # An instance answered: attach to it on the same HTTP port probed above.
    client = weaviate.connect_to_local(
        port=8079,
        grpc_port=50050,
        headers=aoai_headers,
    )
    print("Connected to existing instance")
else:
    # Something answered, but not with a healthy schema response. Fail with a
    # clear message instead of a NameError on the undefined `client` below.
    raise RuntimeError(
        "Unexpected response from http://localhost:8079/v1/schema: "
        f"HTTP {response.status_code}"
    )

print(client.is_ready())

In [None]:
# Pretty-print the metadata returned by the connected Weaviate instance
meta = client.get_meta()
json_print(meta)

## Create a new Collection (EU Destinations)

In [25]:
# Start from a clean slate: remove the "eudestinations" collection if a
# previous run left one behind.
already_exists = client.collections.exists("eudestinations")
if already_exists:
    client.collections.delete("eudestinations")

In [None]:
# Named vector "title_vector", embedded through the Azure OpenAI deployment
# named by `aoai_embedding` (text-embedding-ada-002)
title_vector_config = Configure.NamedVectors.text2vec_azure_openai(
    name="title_vector",
    resource_name="aoai-airlift-1",
    deployment_id=aoai_embedding,
    base_url=aoai_endpoint,
)

# Create the "eudestinations" collection with that single named vector
client.collections.create(
    "eudestinations",
    vectorizer_config=[title_vector_config],
)

## Read the European Tourist Destinations Dataset

_Source: https://www.kaggle.com/datasets/faizadani/european-tour-destinations-dataset_

In [None]:
# Location of the dataset, relative to the notebook's working directory
file_path = '../data/eu_destinations_n.json'

# Decode explicitly as UTF-8 (the standard JSON encoding) rather than relying
# on the platform's default text encoding, which varies between machines.
with open(file_path, "r", encoding="utf-8") as file:
    data = file.read()

# Parse the raw text; the first entry is printed as a quick look at the
# record layout.
ds = json.loads(data)
json_print(ds[0])

## Loading Embeddings into the Weaviate DB

In [None]:
collection = client.collections.get("eudestinations")

# Weaviate property name -> field name in the source dataset records
property_sources = {
    "destination": "Destination",
    "region": "Region",
    "country": "Country",
    "category": "Category",
    "annualtourists": "Approximate Annual Tourists",
    "foods": "Famous Foods",
    "language": "Language",
    "besttimevisit": "Best Time to Visit",
    "costliving": "Cost of Living",
    "cultural": "Cultural Significance",
    "description": "Description",
}

with collection.batch.dynamic() as batch:
    for d in ds:
        # A record missing any of the expected fields raises KeyError here.
        weaviate_obj = {
            prop: d[source] for prop, source in property_sources.items()
        }

        # The model provider integration will automatically vectorize the object
        batch.add_object(
            properties=weaviate_obj,
            # vector=vector  # Optionally provide a pre-obtained vector
        )

# Per-object failures do not raise inside the batch context, so report them
# explicitly; otherwise objects can go missing from the collection unnoticed.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

## Query a specific Collection Dataset

In [30]:
# Handle to the "eudestinations" collection; the query cells use `collection`
collection = client.collections.get("eudestinations")

## Search Patterns

In [40]:
# Fetch a single object (objects are returned in ascending UUID order)
response = collection.query.fetch_objects(limit=1)

# Show the stored properties of each returned object
for obj in response.objects:
    json_print(obj.properties)

{
  "language": "German",
  "description": "Bad Gastein is famous for its thermal baths, offering relaxation and wellness in the Alps.",
  "besttimevisit": "Winter (Dec-Mar) for skiing, Summer (Jun-Sept)",
  "category": "Town",
  "destination": "Bad Gastein",
  "costliving": "Medium-high",
  "region": "Salzburg",
  "annualtourists": "200,000",
  "country": "Austria",
  "foods": "Kaiserschmarrn, Wiener Schnitzel, Apfelstrudel",
  "cultural": "A spa town known for its hot springs, skiing, and outdoor activities."
}


### Sparse and Dense Search

#### Sparse
BM25 search is one implementation of what is commonly called a 'keyword' search. Broadly speaking, it works by matching the search terms between the query and the data objects in the index.

The higher the score, the greater the relevance of the object.

In [49]:
# Keyword (BM25) search for "sea", returning a subset of the properties and
# the BM25 score of each hit
bm25_fields = ['country', 'foods', 'cultural', 'description']
response = collection.query.bm25(
    query="sea",
    limit=5,
    return_properties=bm25_fields,
    return_metadata=MetadataQuery(score=True),
)

# Ranks are 1-based, in the order the results were returned
for rank, hit in enumerate(response.objects, start=1):
    print(f"Rank {rank} - Score: {hit.metadata.score}")
    json_print(hit.properties)

Rank 1 - Score: 1.0217400789260864
{
  "description": null,
  "foods": "Fish, Lamb, Skyr",
  "cultural": "A unique black sand beach with basalt columns and sea stacks.",
  "country": "Iceland"
}
Rank 2 - Score: 1.0217400789260864
{
  "description": "A prime location for beachgoers and water sports enthusiasts.",
  "foods": "Mediterranean cuisine",
  "cultural": "Popular beach along the Mediterranean Sea, ideal for sunbathing and swimming.",
  "country": "Monaco"
}
Rank 3 - Score: 0.9142658710479736
{
  "description": "World-famous for the Cannes Film Festival, featuring stunning views of the Mediterranean Sea.",
  "foods": "Bouillabaisse, A\u008boli, Panisse",
  "cultural": "A glamorous coastal city known for its film festival, luxury shopping, and beautiful beaches.",
  "country": "France"
}


#### Dense
Dense search uses neural-network embeddings to represent and retrieve information based on semantic similarity.

Results are ranked by vector distance: a distance close to 0 means near-identical vectors, while a distance close to 2 means opposing vectors.

In [54]:
# Semantic (dense) search for "sea", returning a subset of the properties and
# the vector distance of each hit
response = collection.query.near_text(
    query="sea",  # The model provider integration will automatically vectorize the query
    return_properties=['country', 'foods', 'cultural', 'description'],
    return_metadata=MetadataQuery(distance=True),
    limit=10,
)

# The value printed is a distance (lower means more similar), so it is
# labelled as such rather than as a "Score", where higher would mean better.
for rank, hit in enumerate(response.objects, start=1):
    print(f"Rank {rank} - Distance: {hit.metadata.distance}")
    json_print(hit.properties)

Rank 1 - Score: 0.21425682306289673
{
  "description": null,
  "foods": "Fish, Lamb, Skyr",
  "cultural": "A glacial lagoon filled with icebergs, offering boat tours and stunning scenery.",
  "country": "Iceland"
}
Rank 2 - Score: 0.21670836210250854
{
  "description": null,
  "foods": "Fish, Lamb, Skyr",
  "cultural": "A unique black sand beach with basalt columns and sea stacks.",
  "country": "Iceland"
}
Rank 3 - Score: 0.218572199344635
{
  "description": "Features stunning exhibits and panoramic views of the Mediterranean.",
  "foods": "Mediterranean cuisine",
  "cultural": "World-renowned aquarium and museum dedicated to marine biology.",
  "country": "Monaco"
}
Rank 4 - Score: 0.21921074390411377
{
  "description": null,
  "foods": "Seafood, Bacalhau, Pastel de nata",
  "cultural": "Coastal city known for its canals, colorful boats, and beautiful beaches.",
  "country": "Portugal"
}
Rank 5 - Score: 0.22174537181854248
{
  "description": null,
  "foods": "Fish, Lamb, Skyr",
  "cu

In [57]:
# Semantic search for "sea" with a distance threshold of 0.219; the stored
# vectors are requested alongside the distance metadata
response = collection.query.near_text(
    query="sea",  # The model provider integration will automatically vectorize the query
    distance=0.219,
    include_vector=True,
    return_metadata=MetadataQuery(distance=True),
)

# The value printed is a distance (lower means more similar), so it is
# labelled as such rather than as a "Score".
for rank, hit in enumerate(response.objects, start=1):
    print(f"Rank {rank} - Distance: {hit.metadata.distance}")
    json_print(hit.properties)
    print(f"Vector: {hit.vector}")

{'language': 'Icelandic', 'description': None, 'besttimevisit': 'Summer (Jun-Aug)', 'country': 'Iceland', 'destination': 'J\x94kuls\xa0rl¢n Glacier Lagoon', 'costliving': 'Medium-high', 'region': 'Southeast Iceland', 'annualtourists': '200,000', 'category': 'Glacier Lagoon', 'foods': 'Fish, Lamb, Skyr', 'cultural': 'A glacial lagoon filled with icebergs, offering boat tours and stunning scenery.'}
0.21425682306289673
{'language': 'Icelandic', 'description': None, 'besttimevisit': 'Year-round', 'country': 'Iceland', 'destination': 'Reynisfjara Black Sand Beach', 'costliving': 'Medium-high', 'region': 'South Iceland', 'annualtourists': '100,000', 'category': 'Beach', 'foods': 'Fish, Lamb, Skyr', 'cultural': 'A unique black sand beach with basalt columns and sea stacks.'}
0.21670836210250854
{'language': 'French, Mon\x82gasque', 'description': 'Features stunning exhibits and panoramic views of the Mediterranean.', 'besttimevisit': 'Spring (April-May) or Fall (Sept-Oct)', 'country': 'Monac

### Hybrid Search (Dense and Sparse)

In [None]:
# Hybrid search for "sea", requesting both the score and its explanation
response = collection.query.hybrid(
    query="sea",
    limit=3,
    return_metadata=MetadataQuery(score=True, explain_score=True),
)

# For every hit: properties first, then the score and the score explanation
for hit in response.objects:
    hit_meta = hit.metadata
    json_print(hit.properties)
    print(hit_meta.score)
    print(hit_meta.explain_score)

In [None]:
# alpha balances the two halves of a hybrid search:
#   alpha = 1 -> pure vector (dense) search
#   alpha = 0 -> pure keyword (sparse) search
response = collection.query.hybrid(query="sea", alpha=0.7, limit=10)

for hit in response.objects:
    json_print(hit.properties)

In [37]:
# Vector component of the hybrid query: text "Scenic view", moved away from
# the concept "Volcano" with force 0.5
scenic_vector = HybridVector.near_text(
    query="Scenic view",
    move_away=Move(force=0.5, concepts=["Volcano"]),
)

# Hybrid search whose keyword query is "sea" and whose vector component is
# the near-text vector built above
response = collection.query.hybrid(
    query="sea",
    vector=scenic_vector,
    max_vector_distance=0.4,  # Maximum threshold for the vector search component
    alpha=0.75,
    limit=10,
)

In [None]:
# Print the properties of every object held in the current `response`
for hit in response.objects:
    json_print(hit.properties)