In [2]:
%pip install -q azure-search-documents python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [3]:
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SearchIndex,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
)
import os
from dotenv import load_dotenv

load_dotenv()

index_name = os.getenv("SEARCH_INDEX_NAME")
vector_search_profile_name = f"{index_name}_profile"

fields = [
    SimpleField(name="id",               type=SearchFieldDataType.String, key=True),
    SearchableField(name="review_text",  type=SearchFieldDataType.String, analyzer_name="en.lucene"),
    SearchableField(name="review_title", type=SearchFieldDataType.String, analyzer_name="en.lucene"),
    SimpleField(name="dateAdded",        type=SearchFieldDataType.DateTimeOffset, filterable=True, sortable=True),
    SimpleField(name="city",             type=SearchFieldDataType.String,         filterable=True),
    SimpleField(name="hotel_name",       type=SearchFieldDataType.String,         filterable=True),
    SimpleField(name="hotel_state",      type=SearchFieldDataType.String,         filterable=True),
    SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=1536,  # Adjust this to match your embedding size
        vector_search_profile_name=vector_search_profile_name,
    ),
]

# Vector search configuration
hnsw_algorithm_name = f"{index_name}_hnsw"
vector_search = VectorSearch(
        algorithms=[HnswAlgorithmConfiguration(name=hnsw_algorithm_name)],
        profiles=[
            VectorSearchProfile(
                name=vector_search_profile_name,
                algorithm_configuration_name=hnsw_algorithm_name,
            )
        ],
    )

semantic_config = SemanticConfiguration(
    name="ps-hotels-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="review_title"),
        content_fields=[SemanticField(field_name="review_text")],
        keywords_fields=[
            SemanticField(field_name="city"),
            SemanticField(field_name="hotel_name"),
            SemanticField(field_name="hotel_state")
        ]
    )
)

# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)

In [4]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient

import os
from dotenv import load_dotenv

load_dotenv()

# Azure Search service details
service_name = os.getenv("SERVICE_NAME")
admin_key = os.getenv("SEARCH_ADMIN_KEY")
index_name = os.getenv("SEARCH_INDEX_NAME")

# Initialize the search index client
endpoint = f"https://{service_name}.search.windows.net/"
print(endpoint)
credential = AzureKeyCredential(admin_key)
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)

https://hotelservice.search.windows.net/


In [5]:
import os
import json
from azure.search.documents import SearchClient
from azure.core.pipeline.transport import RequestsTransport


# Create the index
index_client.create_or_update_index(index)

# Initialize the search client
search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)

# Function to read and process JSON files
def process_json_file(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    
    # Ensure the embedding is a list of floats
    data['embedding'] = [float(x) for x in data['embedding']]
    
    return data

# Directory containing the JSON files
directory = '../data/embedded'

# List to store all documents
documents = []

# Iterate through all JSON files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.json'):
        print(f"Processing {filename}")
        file_path = os.path.join(directory, filename)
        document = process_json_file(file_path)
        documents.append(document)

        # If we have 1000 documents, upload them in batch
        if len(documents) == 1000:
            result = search_client.upload_documents(documents)
            print(f"Uploaded {len(result)} documents")
            documents = []  # Clear the list for the next batch

# Upload any remaining documents
if documents:
    result = search_client.upload_documents(documents)
    print(f"Uploaded {len(result)} documents")

print("Data upload completed.")

Processing review_73.json
Processing review_772.json
Processing review_288.json
Processing review_322.json
Processing review_637.json
Processing review_267.json
Processing review_908.json
Processing review_375.json
Processing review_725.json
Processing review_24.json
Processing review_230.json
Processing review_660.json
Processing review_549.json
Processing review_119.json
Processing review_32.json
Processing review_733.json
Processing review_363.json
Processing review_699.json
Processing review_676.json
Processing review_226.json
Processing review_4.json
Processing review_949.json
Processing review_334.json
Processing review_764.json
Processing review_65.json
Processing review_271.json
Processing review_621.json
Processing review_508.json
Processing review_158.json
Processing review_436.json
Processing review_965.json
Processing review_820.json
Processing review_573.json
Processing review_123.json
Processing review_49.json
Processing review_748.json
Processing review_318.json
Processi