## Connect

In [38]:
# Import libraries
import os
from dotenv import load_dotenv
import json
import weaviate
from weaviate import EmbeddedOptions
from weaviate.classes.config import Configure, Property, DataType
from weaviate.classes.query import MetadataQuery, HybridVector, Move
import warnings

# Suppress every Python warning for the remainder of the session
warnings.filterwarnings("ignore")

# Load the environment variables, then read the Azure OpenAI settings.
# os.environ is indexed directly, so a missing variable raises KeyError here.
load_dotenv()

aoai_key = os.environ["AZURE_OPENAI_API_KEY"]
aoai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
aoai_deployment = os.environ["AZURE_OPENAI_DEPLOYMENT"]
aoai_embedding = os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT"]

### Utility Functions

In [2]:
# JSON print beautifier
def json_print(data):
    """Pretty-print ``data`` to stdout as JSON with a two-space indent.

    Args:
        data: Any object made of JSON-encodable values (dict, list, str,
            numbers, bool, None). Values the ``json`` module cannot encode
            natively, such as ``datetime`` or ``UUID`` objects, are rendered
            with ``str()`` instead of raising ``TypeError``.

    Returns:
        None. The formatted text is written with ``print``.
    """
    # default=str is only consulted for values json cannot serialize itself,
    # so output for plain JSON data is unchanged.
    print(json.dumps(data, indent=2, default=str))

### Create a Weaviate Embedded DB

In [4]:
# Connect to an embedded Weaviate instance at the pinned server version,
# passing the Azure OpenAI endpoint and API key along as request headers.
client = weaviate.connect_to_embedded(
    headers={"X-OpenAI-BaseURL": aoai_endpoint, "X-Azure-Api-Key": aoai_key},
    version="1.26.1",  # e.g. version="1.26.5"
)

# Readiness check; the recorded run printed `true`
json_print(client.is_ready())

{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-10-19T22:46:31Z"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-10-19T22:46:31Z"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-10-19T22:46:31Z"}
{"level":"info","msg":"module offload-s3 is enabled","time":"2024-10-19T22:46:31Z"}
{"level":"info","msg":"open cluster service","servers":{"Embedded_at_8079":58481},"time":"2024-10-19T22:46:31Z"}
{"address":"10.0.10.128:58482","level":"info","msg":"starting cloud rpc server ...","time":"2024-10-19T22:46:31Z"}
{"level":"info","msg":"starting raft sub-system ...","time":"2024-10-19T22:46:31Z"}
{"address":"10.0.10.128:

true


{"action":"telemetry_push","level":"info","msg":"telemetry started","payload":"\u0026{MachineID:1759c739-822d-4aa2-8eb4-eddf0b202c78 Type:INIT Version:1.26.1 NumObjects:0 OS:linux Arch:amd64 UsedModules:[text2vec-openai]}","time":"2024-10-19T22:46:34Z"}
{"action":"bootstrap","level":"info","msg":"node reporting ready, node has probably recovered cluster from raft config. Exiting bootstrap process","time":"2024-10-19T22:46:34Z"}


{"action":"hnsw_prefill_cache_async","level":"info","msg":"not waiting for vector cache prefill, running in background","time":"2024-10-19T22:46:34Z","wait_for_cache_prefill":false}
{"level":"info","msg":"Completed loading shard eudestinations_t87rLMGzhlqI in 80.795934ms","time":"2024-10-19T22:46:34Z"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"vectors_title_vector","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-10-19T22:46:34Z","took":1946745}


In [None]:
# Show Weaviate DB metadata: pretty-print whatever client.get_meta() returns as JSON
json_print(client.get_meta())

### Create Collection (EU Destinations)

In [5]:
# Remove any existing "eudestinations" collection.
# delete() is only called when exists() reports the collection is present.
if client.collections.exists("eudestinations"):
    client.collections.delete("eudestinations")

In [None]:
# Create the "eudestinations" collection with a single named vector,
# "title_vector", configured through the Azure OpenAI text2vec integration.
client.collections.create(
    "eudestinations",
    vectorizer_config=[
        Configure.NamedVectors.text2vec_azure_openai(
            name="title_vector",
            base_url=aoai_endpoint,
            deployment_id=aoai_embedding,
            # NOTE(review): hard-coded Azure resource name; confirm it matches
            # the resource behind AZURE_OPENAI_ENDPOINT.
            resource_name="aoai-airlift-1",
        )
    ],
    # No properties are declared here; the startup log reports auto-schema as
    # enabled, so they are presumably inferred on import (TODO confirm).
)

### Read European Tourist Destinations

In [None]:
# Source data file, relative to the notebook's working directory
file_path = '../data/eu_destinations_n.json'

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding (open() without `encoding` is locale-dependent).
with open(file_path, "r", encoding="utf-8") as file:
    data = file.read()

# Parse the JSON text and preview the first record
ds = json.loads(data)
json_print(ds[0])

### Create Embeddings

In [8]:
collection = client.collections.get("eudestinations")

# Add one object per source record inside the batch context manager.
with collection.batch.dynamic() as batch:
    for d in ds:
        # Map the source JSON keys onto the property names stored in Weaviate
        weaviate_obj = {
            "destination": d["Destination"],
            "region": d["Region"],
            "country": d["Country"],
            "category": d["Category"],
            "annualtourists": d["Approximate Annual Tourists"],
            "foods": d["Famous Foods"],
            "language": d["Language"],
            "besttimevisit": d["Best Time to Visit"],
            "costliving": d["Cost of Living"],
            "cultural": d["Cultural Significance"],
            "description": d["Description"]
        }

        # The model provider integration will automatically vectorize the object
        batch.add_object(
            properties=weaviate_obj,
            # vector=vector  # Optionally provide a pre-obtained vector
        )

# Objects that failed to import are collected on the batch rather than raised,
# so report them explicitly instead of letting a partial import pass silently.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

In [None]:
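# Count the objects in the collection.
# NOTE: the snippet originally kept here used the v3 query builder:
#   client.query.aggregate("eudestinations").with_meta_count().do()
# With the v4 collections API used in this notebook the equivalent is:
# count = client.collections.get("eudestinations").aggregate.over_all(total_count=True)
# print(count.total_count)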

### Query a specific Collection Dataset

In [None]:
# Handle to the "eudestinations" collection; the query cells below all use `collection`
collection = client.collections.get("eudestinations")

### Search Patterns

In [21]:
# Fetch a single object with no search criteria and pretty-print its properties
response = collection.query.fetch_objects(limit=1)

for item in response.objects:
    json_print(item.properties)

{
  "language": "Turkish",
  "description": "A coastal town known for its beaches, ancient ruins, and boat trips to nearby islands.",
  "besttimevisit": "Spring (Apr-May) or Fall (Sep-Oct)",
  "category": "Town",
  "destination": "Fethiye",
  "costliving": "Medium-high",
  "region": "Aegean Region",
  "annualtourists": "500,000",
  "country": "Turkey",
  "foods": "Kebab, Baklava, Dolma",
  "cultural": "Coastal town with beaches and boat trips to islands."
}


In [13]:
# Vector search for "Picasso": up to 5 objects, each returned with its
# distance metadata and its stored vector.
response = collection.query.near_text(
    query="Picasso",  # The model provider integration will automatically vectorize the query
    return_metadata=MetadataQuery(distance=True),
    include_vector=True,
    limit=5,
)

for hit in response.objects:
    # One value per line: properties, then distance, then vector
    print(hit.properties, hit.metadata.distance, hit.vector, sep="\n")

In [7]:
# Same "Picasso" vector search, but with distance=0.20 passed as the
# distance threshold instead of a fixed result limit.
response = collection.query.near_text(
    query="Picasso",  # The model provider integration will automatically vectorize the query
    return_metadata=MetadataQuery(distance=True),
    distance=0.20,
)

for hit in response.objects:
    # Properties first, then the distance reported for that object
    print(hit.properties, hit.metadata.distance, sep="\n")

### Sparse and Dense Search

In [34]:
# Sparse (keyword) search: BM25 for "Sea", top 3, with the score returned
response = collection.query.bm25(
    query="Sea",
    limit=3,
    return_metadata=MetadataQuery(score=True),
)

for match in response.objects:
    json_print(match.properties)
    print(match.metadata.score)

{
  "language": "French, Mon\u0082gasque",
  "description": "A prime location for beachgoers and water sports enthusiasts.",
  "besttimevisit": "Summer (June-September)",
  "country": "Monaco",
  "destination": "Larvotto Beach",
  "costliving": "Extremely high",
  "region": "Monaco-Ville",
  "annualtourists": "100,000",
  "category": "Beach",
  "foods": "Mediterranean cuisine",
  "cultural": "Popular beach along the Mediterranean Sea, ideal for sunbathing and swimming."
}
1.0217400789260864
{
  "language": "Icelandic",
  "description": null,
  "besttimevisit": "Year-round",
  "country": "Iceland",
  "destination": "Reynisfjara Black Sand Beach",
  "costliving": "Medium-high",
  "region": "South Iceland",
  "annualtourists": "100,000",
  "category": "Beach",
  "foods": "Fish, Lamb, Skyr",
  "cultural": "A unique black sand beach with basalt columns and sea stacks."
}
1.0217400789260864
{
  "language": "French",
  "description": "World-famous for the Cannes Film Festival, featuring stunn

### Hybrid Search (Dense and Sparse)

In [35]:
# Hybrid search for "sea", top 3. explain_score=True also returns the
# per-component (keyword / vector) score breakdown shown in the output.
response = collection.query.hybrid(
    query="sea",
    limit=3,
    return_metadata=MetadataQuery(score=True, explain_score=True),
)

for match in response.objects:
    json_print(match.properties)
    # Combined score, then the explanation of how it was composed
    print(match.metadata.score)
    print(match.metadata.explain_score)

{
  "language": "Icelandic",
  "description": null,
  "besttimevisit": "Year-round",
  "country": "Iceland",
  "destination": "Reynisfjara Black Sand Beach",
  "costliving": "Medium-high",
  "region": "South Iceland",
  "annualtourists": "100,000",
  "category": "Beach",
  "foods": "Fish, Lamb, Skyr",
  "cultural": "A unique black sand beach with basalt columns and sea stacks."
}
0.9699116945266724

Hybrid (Result Set keyword,bm25) Document 8756ead5-4ff7-488a-a08b-15cad86b6354: original score 1.0217401, normalized score: 0.3 - 
Hybrid (Result Set vector,hybridVector) Document 8756ead5-4ff7-488a-a08b-15cad86b6354: original score 0.78329164, normalized score: 0.6699117
{
  "language": "French, Mon\u0082gasque",
  "description": "A prime location for beachgoers and water sports enthusiasts.",
  "besttimevisit": "Summer (June-September)",
  "category": "Beach",
  "destination": "Larvotto Beach",
  "costliving": "Extremely high",
  "region": "Monaco-Ville",
  "annualtourists": "100,000",
  

In [36]:
# alpha weights the two halves of a hybrid search:
#   alpha=1 -> pure vector (dense) search
#   alpha=0 -> pure keyword (sparse) search
# 0.7 therefore leans towards the vector side.
response = collection.query.hybrid(
    query="sea",
    limit=10,
    alpha=0.7,
)

for match in response.objects:
    json_print(match.properties)

{
  "language": "Icelandic",
  "description": null,
  "besttimevisit": "Year-round",
  "category": "Beach",
  "destination": "Reynisfjara Black Sand Beach",
  "costliving": "Medium-high",
  "region": "South Iceland",
  "annualtourists": "100,000",
  "country": "Iceland",
  "foods": "Fish, Lamb, Skyr",
  "cultural": "A unique black sand beach with basalt columns and sea stacks."
}
{
  "language": "French, Mon\u0082gasque",
  "description": "A prime location for beachgoers and water sports enthusiasts.",
  "besttimevisit": "Summer (June-September)",
  "category": "Beach",
  "destination": "Larvotto Beach",
  "costliving": "Extremely high",
  "region": "Monaco-Ville",
  "annualtourists": "100,000",
  "country": "Monaco",
  "foods": "Mediterranean cuisine",
  "cultural": "Popular beach along the Mediterranean Sea, ideal for sunbathing and swimming."
}
{
  "language": "Icelandic",
  "description": null,
  "besttimevisit": "Summer (Jun-Aug)",
  "country": "Iceland",
  "destination": "J\u0094

In [44]:
# Hybrid search where the keyword side uses "sea" while the vector side is
# driven by a separate near-text query ("Scenic view") moved away from "Volcano".
# NOTE(review): confirm the pinned embedded server version (1.26.1 in the
# connect cell) supports max_vector_distance; it may need a newer release.
response = collection.query.hybrid(
    query="sea",
    vector=HybridVector.near_text(
        query="Scenic view",
        move_away=Move(concepts=["Volcano"], force=0.5),
    ),
    max_vector_distance=0.4,  # Maximum threshold for the vector search component
    alpha=0.75,
    limit=10,
)

In [45]:
# Pretty-print the properties of each object in the current `response`
for match in response.objects:
    json_print(match.properties)

{
  "language": "Italian",
  "description": "Scenic coastal towns connected by breathtaking hiking trails and beaches.",
  "besttimevisit": "Spring (April-May) or Fall (Sept-Oct)",
  "country": "Italy",
  "destination": "Cinque Terre",
  "costliving": "Medium-high",
  "region": "Liguria",
  "annualtourists": "3 million",
  "category": "Coastal Town",
  "foods": "Seafood, Pesto",
  "cultural": "A group of five colorful villages along the Italian Riviera, known for their stunning landscapes and hiking."
}
{
  "language": "French, German",
  "description": "A cosmopolitan city on Lake Geneva, known for its international organizations, museums, and beautiful scenery.",
  "besttimevisit": "Spring (Apr-May) or Fall (Sep-Oct)",
  "category": "City",
  "destination": "Geneva",
  "costliving": "High",
  "region": "Geneva",
  "annualtourists": "2 million",
  "country": "Switzerland",
  "foods": "Fondue, R\u0094sti, Raclette",
  "cultural": "Cosmopolitan city known for international organizations