## Connect

In [1]:
# Import libraries
import os
from dotenv import load_dotenv
import json
import weaviate
from weaviate import EmbeddedOptions
from weaviate.classes.config import Configure, Property, DataType
import warnings

warnings.filterwarnings('ignore')

# Load the environment variables
load_dotenv()

aoai_key=os.environ['AZURE_OPENAI_API_KEY']
aoai_endpoint=os.environ['AZURE_OPENAI_ENDPOINT']
aoai_deployment=os.environ['AZURE_OPENAI_DEPLOYMENT']
aoai_embedding=os.environ['AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT']



### Utility Funtions

In [2]:
# JSON print beautifier
def json_print(data):
    print(json.dumps(data, indent=2))

### Create a Weaviate Embedded DB

In [3]:
client = weaviate.connect_to_embedded(
    version="1.26.1",  # e.g. version="1.26.5"
    headers={
        "X-OpenAI-BaseURL": aoai_endpoint,
        "X-Azure-Api-Key": aoai_key
    },
)

json_print(client.is_ready())

{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-10-19T21:08:42Z"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-10-19T21:08:42Z"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-10-19T21:08:42Z"}
{"level":"info","msg":"module offload-s3 is enabled","time":"2024-10-19T21:08:42Z"}
{"level":"info","msg":"open cluster service","servers":{"Embedded_at_8079":54439},"time":"2024-10-19T21:08:42Z"}
{"address":"10.0.3.52:54440","level":"info","msg":"starting cloud rpc server ...","time":"2024-10-19T21:08:42Z"}
{"level":"info","msg":"starting raft sub-system ...","time":"2024-10-19T21:08:42Z"}
{"address":"10.0.3.52:5443

true


{"docker_image_tag":"unknown","level":"info","msg":"configured versions","server_version":"1.26.1","time":"2024-10-19T21:08:44Z"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50050","time":"2024-10-19T21:08:44Z"}
{"action":"restapi_management","docker_image_tag":"unknown","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-10-19T21:08:44Z"}
{"address":"10.0.3.52:54439","level":"info","msg":"current Leader","time":"2024-10-19T21:08:44Z"}
{"action":"bootstrap","level":"info","msg":"node reporting ready, node has probably recovered cluster from raft config. Exiting bootstrap process","time":"2024-10-19T21:08:44Z"}


{"action":"telemetry_push","level":"info","msg":"telemetry started","payload":"\u0026{MachineID:3ec1ed07-0566-4590-b922-abf5737b8b4a Type:INIT Version:1.26.1 NumObjects:0 OS:linux Arch:amd64 UsedModules:[text2vec-openai]}","time":"2024-10-19T21:08:44Z"}
{"action":"hnsw_prefill_cache_async","level":"info","msg":"not waiting for vector cache prefill, running in background","time":"2024-10-19T21:08:44Z","wait_for_cache_prefill":false}
{"level":"info","msg":"Completed loading shard eudestinations_t87rLMGzhlqI in 79.119609ms","time":"2024-10-19T21:08:44Z"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"vectors_title_vector","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-10-19T21:08:44Z","took":2004391}


In [4]:
# Show Weaviate DB metadata
json_print(client.get_meta())

{
  "hostname": "http://127.0.0.1:8079",
  "modules": {
    "generative-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "Generative Search - OpenAI"
    },
    "qna-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "OpenAI Question & Answering Module"
    },
    "ref2vec-centroid": {},
    "reranker-cohere": {
      "documentationHref": "https://txt.cohere.com/rerank/",
      "name": "Reranker - Cohere"
    },
    "text2vec-cohere": {
      "documentationHref": "https://docs.cohere.ai/embedding-wiki/",
      "name": "Cohere Module"
    },
    "text2vec-huggingface": {
      "documentationHref": "https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task",
      "name": "Hugging Face Module"
    },
    "text2vec-openai": {
      "documentationHref": "https://platform.openai.com/docs/guides/embeddings/what-are-embeddings",
      "nam

### Create Collection (EU Destinations)

In [5]:
if client.collections.exists("eudestinations"):
    client.collections.delete("eudestinations")

In [6]:
client.collections.create(
    "eudestinations",
    vectorizer_config=[
        Configure.NamedVectors.text2vec_azure_openai(
            name="title_vector",            
            resource_name="aoai-airlift-1",
            deployment_id=aoai_embedding,
            base_url=aoai_endpoint
        )
    ],
    # Additional parameters not shown
)

<weaviate.collections.collection.sync.Collection at 0x7d0bad60a9f0>

{"action":"hnsw_prefill_cache_async","level":"info","msg":"not waiting for vector cache prefill, running in background","time":"2024-10-19T00:24:17Z","wait_for_cache_prefill":false}
{"level":"info","msg":"Created shard eudestinations_t87rLMGzhlqI in 1.101925ms","time":"2024-10-19T00:24:17Z"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"vectors_title_vector","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-10-19T00:24:17Z","took":39143}


### Read Europe Touristic Destinations

In [7]:
file_path = '../data/eu_destinations_n.json'

with open(file_path, "r") as file:
    data = file.read()

ds = json.loads(data)
json_print(ds[0])

{
  "Destination": "Rome",
  "Region": "Lazio",
  "Country": "Italy",
  "Category": "City",
  "Approximate Annual Tourists": "14 million",
  "Famous Foods": "Pizza, Pasta, Gelato",
  "Language": "Italian",
  "Best Time to Visit": "Spring (April-May) or Fall (Sept-Oct)",
  "Cost of Living": "Medium-high",
  "Cultural Significance": "The capital city, known for its historical landmarks like the Colosseum, Vatican City, and Pantheon.",
  "Description": "A hub of ancient history and modern culture, with rich traditions, art, and landmarks."
}


### Create Embeddings

In [8]:
collection = client.collections.get("eudestinations")

with collection.batch.dynamic() as batch:
    for i, d in enumerate(ds):
        weaviate_obj = {
            "destination": d["Destination"],
            "region": d["Region"],
            "country": d["Country"],
            "category": d["Category"],
            "annualtourists": d["Approximate Annual Tourists"],
            "foods": d["Famous Foods"],
            "language": d["Language"],
            "besttimevisit": d["Best Time to Visit"],
            "costliving": d["Cost of Living"],
            "cultural": d["Cultural Significance"],
            "description": d["Description"]
        }

        # The model provider integration will automatically vectorize the object
        batch.add_object(
            properties=weaviate_obj,
            # vector=vector  # Optionally provide a pre-obtained vector
        )

In [None]:
# count = client.query.aggregate("eudestinations").with_meta_count().do()
# json_print(count)

In [5]:
from weaviate.classes.query import MetadataQuery

collection = client.collections.get("eudestinations")

response = collection.query.near_text(
    query="Picasso",  # The model provider integration will automatically vectorize the query
    limit=5,
    return_metadata=MetadataQuery(distance=True)
)

In [6]:
for obj in response.objects:
    print(obj.properties)
    print(obj.metadata.distance)

{'language': 'Spanish', 'description': 'Birthplace of Picasso, featuring stunning beaches and a lively port.', 'besttimevisit': 'Spring (April-May) or Fall (Sept-Oct)', 'category': 'City', 'destination': 'M\xa0laga', 'costliving': 'Medium', 'region': 'Andalusia', 'annualtourists': '2 million', 'country': 'Spain', 'foods': 'Tapas, Gazpacho, Espetos', 'cultural': 'A coastal city known for its beautiful beaches, historic center, and Picasso Museum.'}
0.18377715349197388
{'language': 'Spanish (and Basque)', 'description': 'An industrial city turned cultural hotspot, home to the iconic Guggenheim Museum.', 'besttimevisit': 'Spring (April-May) or Fall (Sept-Oct)', 'country': 'Spain', 'destination': 'Bilbao', 'costliving': 'Medium-high', 'region': 'Basque Country', 'annualtourists': '1.5 million', 'category': 'City', 'foods': 'Pintxos, Bacalao al pil-pil', 'cultural': 'A modern city known for its Guggenheim Museum, vibrant atmosphere, and delicious pintxos.'}
0.18603205680847168
{'language': 

In [7]:
response = collection.query.near_text(
    query="Picasso",  # The model provider integration will automatically vectorize the query    
    distance=0.20, 
    return_metadata=MetadataQuery(distance=True)
)

In [8]:
for obj in response.objects:
    print(obj.properties)
    print(obj.metadata.distance)

{'language': 'Spanish', 'description': 'Birthplace of Picasso, featuring stunning beaches and a lively port.', 'besttimevisit': 'Spring (April-May) or Fall (Sept-Oct)', 'category': 'City', 'destination': 'M\xa0laga', 'costliving': 'Medium', 'region': 'Andalusia', 'annualtourists': '2 million', 'country': 'Spain', 'foods': 'Tapas, Gazpacho, Espetos', 'cultural': 'A coastal city known for its beautiful beaches, historic center, and Picasso Museum.'}
0.18377715349197388
{'language': 'Spanish (and Basque)', 'description': 'An industrial city turned cultural hotspot, home to the iconic Guggenheim Museum.', 'besttimevisit': 'Spring (April-May) or Fall (Sept-Oct)', 'country': 'Spain', 'destination': 'Bilbao', 'costliving': 'Medium-high', 'region': 'Basque Country', 'annualtourists': '1.5 million', 'category': 'City', 'foods': 'Pintxos, Bacalao al pil-pil', 'cultural': 'A modern city known for its Guggenheim Museum, vibrant atmosphere, and delicious pintxos.'}
0.18603205680847168


In [9]:
response = collection.query.fetch_objects(
    include_vector=True,
    limit=1
)
print(response.objects[0].vector["title_vector"])

KeyError: 'default'

[0.00431612366810441, -0.03275689482688904, -0.008665810339152813, -0.008484573103487492, -0.02239282801747322, 0.04306726157665253, -0.06202330440282822, 0.008182511664927006, 0.002503754571080208, -0.03989897295832634, -0.003665013238787651, 0.03734822943806648, -0.028997907415032387, 0.008712797425687313, -0.012035474181175232, -0.013257144950330257, 0.007236051838845015, -0.022285429760813713, 0.0009212876902893186, 0.0032438053749501705, -0.010827228426933289, 0.01760011911392212, 0.005735813174396753, -0.011350801214575768, -0.005675400607287884, 0.0383148267865181, -0.000148618477396667, -0.007074952591210604, -0.018942615017294884, 0.0016974180471152067, 0.010458041913807392, 0.018687540665268898, -0.00020210853836033493, -0.01966756209731102, -0.018029717728495598, 0.01557295024394989, -0.0075179762206971645, -0.013478657230734825, 0.02357422560453415, 0.0059539685025811195, 0.006534597836434841, 0.0007018735632300377, -0.010310366749763489, 0.02305065095424652, 0.014901702292