# Searching across Elastic, Azure Search (basic), Azure Search (semantic), and Azure Search (embeddings)

## Elastic Search

### Setup
Ensure you have a docker daemon running. Then run the following commands to start an instance of Elastic Search.
```sh
docker network create es-net-01

docker pull docker.elastic.co/elasticsearch/elasticsearch:8.8.2
docker run -d --name es-node-01 --net es-net-01 -p 9200:9200 -p 9300:9300 \
           -e "discovery.type=single-node" \
           -e "xpack.security.enabled=false" \
           -e "http.cors.enabled=true" \
           -e "http.cors.allow-origin=http://localhost:8080" \
           docker.elastic.co/elasticsearch/elasticsearch:8.8.2

docker pull cars10/elasticvue
docker run --name es-ui --net es-net-01 -p 8080:8080 -d cars10/elasticvue
```

In [None]:
# Packages used throughout this notebook (Elasticsearch, Azure Search, OpenAI and helpers)
%pip install elasticsearch
%pip install python-dotenv
%pip install azure-search --pre
%pip install azure-search-documents --pre
%pip install azure-core
%pip install pandas
%pip install openai
%pip install tenacity

In [None]:
# Load environment variables
import os
import json
import pandas as pd
from pprint import pprint

from dotenv import load_dotenv
load_dotenv()

# Load data.csv and convert it to a list of plain dicts, one per row.
# The round trip through to_json/json.loads yields JSON-native Python values.
dataset = json.loads(pd.read_csv('data.csv').to_json(orient='records'))

# Show the records that the rest of the notebook will index
pprint(dataset, indent=2, sort_dicts=False)

### Create and Populate
Create an index & populate data

In [None]:
from elasticsearch import Elasticsearch

# Connect to the Elasticsearch cluster at localhost:9200 with no authentication
es = Elasticsearch(['http://localhost:9200/'])

# Name of the index that will hold the dataset
es_index_name = 'animals'

# Fail fast with a readable message if the cluster cannot be reached
if not es.ping():
    raise ValueError('Connection failed. Check if the container is running ')

# Start from a clean slate: drop any previous copy of the index, then recreate it
if es.indices.exists(index=es_index_name):
    es.indices.delete(index=es_index_name)
es.indices.create(index=es_index_name)

# Index every record, using the record's own 'id' value as the document id
for record in dataset:
    es.index(index=es_index_name, id=record['id'], document=record)

### Search and Print

In [None]:
import json
from pprint import pprint

def perform_es(query):
    """Search the Elasticsearch index and pretty-print the matching hits.

    query: text handed to Elasticsearch as the `q` search parameter.
    """
    print('Searching for documents with query: ', query)
    response = es.search(index=es_index_name, q=query)
    hits = response['hits']['hits']
    pprint(hits, indent=4, sort_dicts=False)

# Run two sample wildcard queries through perform_es
perform_es('Oct*')
perform_es('*has tentacles*')

## Azure Search

### Setup
Follow these instructions to create an Azure Search instance: https://learn.microsoft.com/en-us/azure/search/search-create-service-portal

In [None]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (
    CorsOptions,
    SearchIndex,
    SemanticConfiguration,
    SearchFieldDataType,
    SearchField,    
    SimpleField,  
    SearchableField, 
    PrioritizedFields,
    SemanticField,
    SemanticSettings,
)

from azure.core.exceptions import (
    ResourceNotFoundError
)

from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
)  
    

# Read the Azure Search settings from the environment.
# NOTE: despite its name, AZURE_SEARCH_ENDPOINT is used here as the short
# service name; the full endpoint URL is derived from it.
service_name = os.environ["AZURE_SEARCH_ENDPOINT"]
key = os.environ["AZURE_SEARCH_API_KEY"]
endpoint = f"https://{service_name}.search.windows.net/"

# Client used to create and delete indexes on the service
azure_search_client = SearchIndexClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

### Azure Search (Basic)

#### Create and Populate Index (Basic)

In [None]:
# Plain keyword index: a string key plus three full-text searchable fields
basic_index_name = "animals-index-basic"

basic_fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="name", type=SearchFieldDataType.String),
    SearchableField(name="animal", type=SearchFieldDataType.String),
    SearchableField(name="song", type=SearchFieldDataType.String),
]

basic_index = SearchIndex(
    name=basic_index_name,
    fields=basic_fields,
    scoring_profiles=[],
    # CORS: any origin is allowed, with max_age_in_seconds set to 60
    cors_options=CorsOptions(allowed_origins=["*"], max_age_in_seconds=60),
)


# Recreate the index from scratch: if a previous copy exists, delete it first.
# Only "index not found" is tolerated here; any other failure (bad key,
# network error, ...) must surface instead of being silently swallowed.
try:
    azure_search_client.get_index(basic_index_name)
except ResourceNotFoundError:
    # No such index yet, so there is nothing to delete
    pass
else:
    azure_search_client.delete_index(basic_index_name)

azure_search_client.create_index(basic_index)
searchClient = SearchClient(endpoint, basic_index_name, AzureKeyCredential(key))

# The index declares "id" as a string key, so coerce every id in place
for record in dataset:
    record['id'] = str(record['id'])

# Upload everything in one request rather than one request per document.
# NOTE(review): assumes the dataset fits in a single indexing batch - confirm
# against the service's batch limits if the dataset grows.
searchClient.upload_documents(documents=dataset)

#### Search and Print (Basic)

In [None]:
def perform_basic_search(query):
    """Run a keyword search against the basic index and print every hit.

    query: search text passed to the service as `search_text`.

    Prints the id, score and full content of each hit, or
    'No results found' when the search yields nothing.
    """
    print('Searching for documents with query: ', query)

    # Call the search method on the search client and pass in the query
    results = searchClient.search(search_text=query)

    found_any = False
    for result in results:
        found_any = True
        print('Document id: ', result['id'], ' Score: ', result['@search.score'])
        pprint(result, indent=4)

    # get_count() is only populated when the total count is requested with the
    # query, so decide "no results" from what was actually iterated instead.
    if not found_any:
        print('No results found')


# Run two sample prefix queries through perform_basic_search
perform_basic_search('Oct*')
perform_basic_search('has tentacles*')


### Azure Search (Semantic)

#### Create and Populate Index (Semantic)

In [None]:
semantic_index_name = "animals-index-semantic"
semantic_config_name = "my-semantic-config"

# Same searchable fields as the basic index
semantic_fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="name", type=SearchFieldDataType.String),
    SearchableField(name="animal", type=SearchFieldDataType.String),
    SearchableField(name="song", type=SearchFieldDataType.String),
]

semantic_prioritized_fields = PrioritizedFields(
    # A title field should be a concise description of the document, ideally
    # under 25 words (document title, product name, ...). Leave it blank if
    # the index has no such field. Here "animal" is used because it
    # describes the type.
    title_field=SemanticField(field_name="animal"),
    # Keyword fields should hold keywords, such as the tags on a document or
    # the category of an item. List them in order of priority, because
    # lower-priority fields may get truncated or ignored.
    prioritized_keywords_fields=[SemanticField(field_name="name")],
    prioritized_content_fields=[SemanticField(field_name="song")],
)

semantic_index = SearchIndex(
    name=semantic_index_name,
    fields=semantic_fields,
    scoring_profiles=[],
    cors_options=CorsOptions(allowed_origins=["*"], max_age_in_seconds=60),
    semantic_settings=SemanticSettings(
        configurations=[
            SemanticConfiguration(
                name=semantic_config_name,
                prioritized_fields=semantic_prioritized_fields,
            )
        ]
    ),
)


# Recreate the index from scratch: if a previous copy exists, delete it first.
# Only "index not found" is tolerated here; any other failure (bad key,
# network error, ...) must surface instead of being silently swallowed.
try:
    azure_search_client.get_index(semantic_index_name)
except ResourceNotFoundError:
    # No such index yet, so there is nothing to delete
    pass
else:
    azure_search_client.delete_index(semantic_index_name)

azure_search_client.create_index(semantic_index)
semantic_search_client = SearchClient(endpoint, semantic_index_name, AzureKeyCredential(key))

# The index declares "id" as a string key, so coerce every id in place
for record in dataset:
    record['id'] = str(record['id'])

# Upload everything in one request rather than one request per document.
# NOTE(review): assumes the dataset fits in a single indexing batch - confirm
# against the service's batch limits if the dataset grows.
semantic_search_client.upload_documents(documents=dataset)

#### Search and Print (Semantic)

In [None]:
def perform_semantic_search(query):
    """Run a semantic search against the semantic index and print every hit.

    query: search text passed to the service as `search_text`.

    Prints the id, score and full content of each hit, or
    'No results found' when the search yields nothing.
    """
    print('Searching for documents with query: ', query)

    semantic_search_client = SearchClient(endpoint, semantic_index_name, AzureKeyCredential(key))

    # A semantic configuration is only used when the query itself is
    # declared as semantic, so the query type is set explicitly.
    # NOTE(review): query_language targets the preview SDK this notebook
    # installs with --pre; confirm it against the installed version.
    results = semantic_search_client.search(
        search_text=query,
        query_type="semantic",
        query_language="en-us",
        semantic_configuration_name=semantic_config_name,
    )

    # get_count() is only populated when the total count is requested with the
    # query, so materialise the hits and check the list itself.
    hits = list(results)
    if not hits:
        print('No results found')

    for result in hits:
        print('Document id: ', result['id'], ' Score: ', result['@search.score'])
        pprint(result, indent=4)


# Run two sample queries through perform_semantic_search
perform_semantic_search('Oct*')
perform_semantic_search('has tentacles')


### Azure Search (Embeddings)

#### Create Embeddings

In [None]:
import os
import openai

# OpenAI credentials, read from the environment (None when a variable is unset)
openai.organization = os.environ.get("OPENAI_ORGANIZATION")
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Alternative Azure OpenAI configuration, left disabled:
# openai.api_type = "azure"
# openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")
# openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
# openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")

# Import required libraries  
import os  
import json  
import openai  
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    VectorSearchAlgorithmConfiguration,  
)  
  

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text):
    """Return the embedding vector for `text`.

    Used for the document fields as well as for search queries. Failed calls
    are retried by the decorator with a random exponential wait
    (min=1, max=20), stopping after 6 attempts.
    """
    reply = openai.Embedding.create(input=text, engine="text-embedding-ada-002")
    return reply['data'][0]['embedding']


# Attach embedding vectors for "animal" and "name" to every record
# (song embeddings are not generated)
for record in dataset:
    animal_vector = generate_embeddings(record['animal'])
    name_vector = generate_embeddings(record['name'])
    record.update(animalVector=animal_vector, nameVector=name_vector)

# Print the whole enriched dataset, embeddings included
pprint(dataset, indent=0, sort_dicts=False, width=100)


# import os  
# import json  
# import openai  
# from dotenv import load_dotenv  
# from tenacity import retry, wait_random_exponential, stop_after_attempt  
# from azure.search.documents.indexes.models import (  
     
#     SemanticConfiguration,  
#     PrioritizedFields,  
#     SemanticField,    
#     SemanticSettings,  
#     VectorSearch,  
#     VectorSearchAlgorithmConfiguration,  
# ) 

# # See: https://github.com/Azure/cognitive-search-vector-pr/blob/main/demo-python/code/azure-search-vector-python-sample.ipynb


In [None]:
vector_index_name = "animals-index-vector"
vector_semantic_config_name = "my-vector-semantic-config"
vector_config_name = "my-vector-config"


def _embedding_field(field_name):
    """Build a searchable vector field bound to the shared vector configuration."""
    # NOTE(review): 1536 must match the length of the stored embeddings - confirm
    # against the embedding model in use.
    return SearchField(
        name=field_name,
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_configuration=vector_config_name,
    )


# Text fields plus one vector field per embedded text field
# (there is no songVector field)
vector_fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="name", type=SearchFieldDataType.String),
    SearchableField(name="animal", type=SearchFieldDataType.String),
    SearchableField(name="song", type=SearchFieldDataType.String),
    _embedding_field("nameVector"),
    _embedding_field("animalVector"),
]

vector_prioritized_fields = PrioritizedFields(
    # A title field should be a concise description of the document, ideally
    # under 25 words (document title, product name, ...). Leave it blank if
    # the index has no such field. Here "animal" is used because it
    # describes the type.
    title_field=SemanticField(field_name="animal"),
    # Keyword fields should hold keywords, such as the tags on a document or
    # the category of an item. List them in order of priority, because
    # lower-priority fields may get truncated or ignored.
    prioritized_keywords_fields=[SemanticField(field_name="name")],
    prioritized_content_fields=[SemanticField(field_name="song")],
)

vector_algorithm = VectorSearchAlgorithmConfiguration(
    name=vector_config_name,
    kind="hnsw",
    hnsw_parameters={
        "m": 4,
        "efConstruction": 400,
        "efSearch": 500,
        "metric": "cosine",
    },
)

vector_index = SearchIndex(
    name=vector_index_name,
    fields=vector_fields,
    scoring_profiles=[],
    cors_options=CorsOptions(allowed_origins=["*"], max_age_in_seconds=60),
    semantic_settings=SemanticSettings(
        configurations=[
            SemanticConfiguration(
                name=vector_semantic_config_name,
                prioritized_fields=vector_prioritized_fields,
            )
        ]
    ),
    vector_search=VectorSearch(algorithm_configurations=[vector_algorithm]),
)


# Recreate the index from scratch: if a previous copy exists, delete it first.
# Only "index not found" is tolerated here; any other failure (bad key,
# network error, ...) must surface instead of being silently swallowed.
try:
    azure_search_client.get_index(vector_index_name)
except ResourceNotFoundError:
    # No such index yet, so there is nothing to delete
    pass
else:
    azure_search_client.delete_index(vector_index_name)

azure_search_client.create_index(vector_index)
vector_search_client = SearchClient(endpoint, vector_index_name, AzureKeyCredential(key))

# The index declares "id" as a string key, so coerce every id in place
for record in dataset:
    record['id'] = str(record['id'])

# Upload everything in one request rather than one request per document.
# NOTE(review): each record carries two embedding vectors; assumes the dataset
# still fits in a single indexing batch - confirm against the service's
# batch limits if the dataset grows.
vector_search_client.upload_documents(documents=dataset)

#### Search and Print (Embeddings)

In [None]:
def perform_vector_search(query):
    """Embed `query` and print the nearest match on the "animalVector" field.

    query: free text; it is converted to an embedding with
    generate_embeddings before being sent to the service.

    Prints animal, name, id and score of each hit, or 'No results found'
    when the search yields nothing.
    """
    print('Searching for documents with query: ', query)
    vector_query = generate_embeddings(query)
    vector_search_client = SearchClient(endpoint, vector_index_name, AzureKeyCredential(key))

    # Pure vector query: no search text, only the single nearest neighbour
    results = vector_search_client.search(
        search_text=None,
        vector=vector_query,
        top_k=1,
        vector_fields="animalVector",
    )

    found_any = False
    for result in results:
        found_any = True
        print(' Animal: ', result['animal'],
              ', Name: ', result['name'],
              ', Document id: ', result['id'],
              ', Score: ', result['@search.score'],
              )

    # Report an empty result set explicitly, like the other search helpers do
    if not found_any:
        print('No results found')
        # pprint(result, indent=4)
        


# Run two sample queries through perform_vector_search
perform_vector_search('Oct')
perform_vector_search('has tentacles') 