# Azure Open AI - surveys - part 1 Embeddings

In [1]:
#%pip install azure-search-documents --pre --upgrade

In [2]:
import json
import math
import openai
import os
import pandas as pd
import pickle
import pytz
import requests
import sys
import tiktoken
import time

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    VectorSearch,
    SimpleField,
    SemanticSettings,
    SemanticField,
    SemanticConfiguration,
    SearchIndex,
    SearchFieldDataType,
    SearchField,
    SearchableField,
    PrioritizedFields,
    HnswVectorSearchAlgorithmConfiguration,
)

from datetime import datetime
from dotenv import load_dotenv
from tqdm import tqdm

In [3]:
sys.version

'3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]'

In [4]:
local_tz = pytz.timezone(requests.get("https://ipinfo.io").json()["timezone"])
print("Local time:", datetime.now(local_tz).strftime("%d-%b-%Y %H:%M:%S"))

Local time: 07-Sep-2023 18:40:34


In [5]:
print("Open AI version:", openai.__version__)

Open AI version: 0.28.0


In [6]:
load_dotenv("azure.env")

openai.api_type: str = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")

acs_endpoint = os.getenv("AZURE_COGNITIVE_SEARCH_ENDPOINT")
acs_key = os.getenv("AZURE_COGNITIVE_SEARCH_API_KEY")

In [7]:
# Azure Open AI embeddings model to use
embeddings_engine = "text-embedding-ada-002"

- Vector search is in public preview
- Model name: text-embedding-ada-002
- Model version: 2
- API version: 2023-05-15

In [8]:
# Azure Cognitive Search index name to create
index_name = "surveys"

## 0. Azure Cognitive Search vector store
<img src="https://github.com/retkowsky/images/blob/master/vector_search_architecture.png?raw=true">

## 1. Data

In [9]:
EXCEL_FILE = "surveys.xlsx"

!ls $EXCEL_FILE -lh

-rwxrwxrwx 1 root root 160K Sep  7 16:40 surveys.xlsx


In [10]:
df = pd.read_excel(EXCEL_FILE)

In [11]:
df

Unnamed: 0,id,company,comments,month,year
0,100,Réassurez-moi,Ravi ! La recherche et la comparaison des mutu...,8,2019
1,101,Réassurez-moi,Très satisfaite ! Un conseiller à l'écoute et ...,8,2019
2,102,Réassurez-moi,Je vous avais contacté pour la mise en place e...,8,2019
3,103,Réassurez-moi,Quelques mots sur Réassurez-moi. Je suis très ...,8,2019
4,104,Réassurez-moi,Votre cabinet m'a permis de pouvoir bénéficier...,8,2019
...,...,...,...,...,...
1055,1155,Néoliane Santé,très facile d accès personnel sympathique accu...,10,2018
1056,1156,Néoliane Santé,je suis passer par senttianne jai recu un accu...,10,2018
1057,1157,Néoliane Santé,je suis chez neoliane depuis 2016 et suis tres...,10,2018
1058,1158,Néoliane Santé,j'avais une mutuelle en 2017 et je n'étais pas...,10,2018


In [12]:
df["id"] = df["id"].astype(str)
df["year"] = df["year"].astype(str)
df["month"] = df["month"].astype(str)
df["text"] = " Year=" + df["year"] + " month=" + df["month"] + " " + "company:" + df["company"] + " comment: " + df["comments"]

In [13]:
df.head(5)

Unnamed: 0,id,company,comments,month,year,text
0,100,Réassurez-moi,Ravi ! La recherche et la comparaison des mutu...,8,2019,Year=2019 month=8 company:Réassurez-moi comme...
1,101,Réassurez-moi,Très satisfaite ! Un conseiller à l'écoute et ...,8,2019,Year=2019 month=8 company:Réassurez-moi comme...
2,102,Réassurez-moi,Je vous avais contacté pour la mise en place e...,8,2019,Year=2019 month=8 company:Réassurez-moi comme...
3,103,Réassurez-moi,Quelques mots sur Réassurez-moi. Je suis très ...,8,2019,Year=2019 month=8 company:Réassurez-moi comme...
4,104,Réassurez-moi,Votre cabinet m'a permis de pouvoir bénéficier...,8,2019,Year=2019 month=8 company:Réassurez-moi comme...


In [14]:
df.dtypes

id          object
company     object
comments    object
month       object
year        object
text        object
dtype: object

## 2. Generating text embeddings with Azure Open AI

### Vectors embeddings

In [15]:
print("Embedding engine:", embeddings_engine)

Embedding engine: text-embedding-ada-002


In [16]:
def openai_text_embeddings(text):
    """
    Generating embeddings from text using Azure Open AI
    Input: text
    Output: text embeddings
    """
    embeddings = openai.Embedding.create(
        input=text,
        deployment_id=embeddings_engine,
    )
    embeddings = embeddings["data"][0]["embedding"]

    return embeddings

In [17]:
emb = openai_text_embeddings("My name is James Bond")
emb[:5]

[-0.03617486730217934,
 -0.005520837381482124,
 -0.007070655468851328,
 -0.030174769461154938,
 0.0020399712957441807]

In [18]:
print("Size of the embeddings =", len(emb))

Size of the embeddings = 1536


In [19]:
print("Running the embedding process...")
df["embed_text"] = None

with tqdm(total=len(df)) as pbar:
    def apply_embedding(x):
        """
        Azure Open AI text embedding
        """
        global pbar
        embedding = openai_text_embeddings(x["text"])
        pbar.update(1)  # Update the progress bar
        return embedding
    df["embed_text"] = df.apply(apply_embedding, axis=1)

Running the embedding process...


100%|██████████| 1060/1060 [02:09<00:00,  8.17it/s]


### Saving the documents (initial data + embeddings) into a file

In [21]:
df

Unnamed: 0,id,company,comments,month,year,text,embed_text
0,100,Réassurez-moi,Ravi ! La recherche et la comparaison des mutu...,8,2019,Year=2019 month=8 company:Réassurez-moi comme...,"[-0.0184832401573658, -0.008551266975700855, 0..."
1,101,Réassurez-moi,Très satisfaite ! Un conseiller à l'écoute et ...,8,2019,Year=2019 month=8 company:Réassurez-moi comme...,"[-0.019603285938501358, -0.006490982137620449,..."
2,102,Réassurez-moi,Je vous avais contacté pour la mise en place e...,8,2019,Year=2019 month=8 company:Réassurez-moi comme...,"[-0.016711777076125145, -0.004223286639899015,..."
3,103,Réassurez-moi,Quelques mots sur Réassurez-moi. Je suis très ...,8,2019,Year=2019 month=8 company:Réassurez-moi comme...,"[-0.0205739364027977, -0.009005204774439335, -..."
4,104,Réassurez-moi,Votre cabinet m'a permis de pouvoir bénéficier...,8,2019,Year=2019 month=8 company:Réassurez-moi comme...,"[-0.008775348775088787, -0.006393004208803177,..."
...,...,...,...,...,...,...,...
1055,1155,Néoliane Santé,très facile d accès personnel sympathique accu...,10,2018,Year=2018 month=10 company:Néoliane Santé co...,"[-0.007273130118846893, 0.0017582291038706899,..."
1056,1156,Néoliane Santé,je suis passer par senttianne jai recu un accu...,10,2018,Year=2018 month=10 company:Néoliane Santé co...,"[-0.015039212070405483, -0.01120542362332344, ..."
1057,1157,Néoliane Santé,je suis chez neoliane depuis 2016 et suis tres...,10,2018,Year=2018 month=10 company:Néoliane Santé co...,"[-0.01934194564819336, -0.01205415278673172, -..."
1058,1158,Néoliane Santé,j'avais une mutuelle en 2017 et je n'étais pas...,10,2018,Year=2018 month=10 company:Néoliane Santé co...,"[-0.012488491833209991, -0.015272384509444237,..."


In [22]:
documents = df.to_dict(orient="records")
print("Number of documents =", len(documents))

Number of documents = 1060


In [23]:
# Saving the documents into a pkl file
PKL_DIR = "embeddings"
PKL_FILE = "surveys.pkl"

os.makedirs(PKL_DIR, exist_ok=True)

print("Saving documents...")
with open(os.path.join(PKL_DIR, PKL_FILE), 'wb') as f:
    pickle.dump(documents, f)
print("Done")

os.listdir(PKL_DIR)

Saving documents...
Done


['surveys.pkl']

## 5. Azure Cognitive Search functions

In [24]:
def delete_index(index_name):
    """
    Deleting an Azure Cognitive Search index
    Input: Azure Cognitive Search index
    Output: None
    """
    start = time.time()
    search_client = SearchIndexClient(
        endpoint=acs_endpoint, credential=AzureKeyCredential(acs_key)
    )
    
    try:
        print("Deleting the Azure Cognitive Search index:", index_name)
        search_client.delete_index(index_name)
        print("Done. Elapsed time:", round(time.time() - start, 2), "secs")
    except:
        print("Cannot delete index. Check the index name.")

In [25]:
def index_stats(index_name):
    """
    Get statistics about Azure Cognitive Search index
    Input: Azure Cognitive Search index
    Output: Get Azure Cognitive Search index stats
    """
    url = (
        acs_endpoint
        + "/indexes/"
        + index_name
        + "/stats?api-version=2021-04-30-Preview"
    )
    headers = {
        "Content-Type": "application/json",
        "api-key": acs_key,
    }
    response = requests.get(url, headers=headers)
    print("Azure Cognitive Search index status for:", index_name, "\n")

    if response.status_code == 200:
        res = response.json()
        print(json.dumps(res, indent=2))
        document_count = res["documentCount"]
        storage_size = res["storageSize"]

    else:
        print("Request failed with status code:", response.status_code)

    return document_count, storage_size

In [26]:
def index_status(index_name):
    """
    Azure Cognitive Search index status
    Input: Azure Cognitive Search index
    Output: Get Azure Cognitive Search index status
    """
    print("Azure Cognitive Search Index:", index_name, "\n")

    headers = {"Content-Type": "application/json", "api-key": acs_key}
    params = {"api-version": "2021-04-30-Preview"}
    index_status = requests.get(
        acs_endpoint + "/indexes/" + index_name, headers=headers, params=params
    )

    try:
        print(json.dumps((index_status.json()), indent=5))
    except:
        print("Request failed with status code:", response.status_code)

## 6. Creating an Azure Cognitive Search index

In [27]:
try:
    # Setting the Azure Cognitive Search client
    print("Setting the Azure Cognitive Search client")
    search_client = SearchIndexClient(
        endpoint=acs_endpoint,
        credential=AzureKeyCredential(acs_key)
    )
    print("Done. Azure Cognitive Search client defined.")
    print(search_client)

except:
    print("Request failed. Cannot create Azure Cognitive Search client:", acs_endpoint)

Setting the Azure Cognitive Search client
Done. Azure Cognitive Search client defined.
<azure.search.documents.indexes._search_index_client.SearchIndexClient object at 0x7f69d2b55990>


### Removing any existing index

In [28]:
delete_index(index_name)

Deleting the Azure Cognitive Search index: surveys
Done. Elapsed time: 0.66 secs


### Creating search index

In [29]:
vector_search_dim = len(openai_text_embeddings("Hello"))
print("Vector embeddings size =", vector_search_dim)

Vector embeddings size = 1536


In [30]:
# Create a search index
index_client = SearchIndexClient(
    endpoint=acs_endpoint, credential=AzureKeyCredential(acs_key)
)
fields = [
    # Index
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    # Searchable fields
    SearchableField(name="company", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="month", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="year", type=SearchFieldDataType.Single, filterable=True),
    SearchableField(name="comments", type=SearchFieldDataType.String),
    SearchableField(name="text", type=SearchFieldDataType.String),

    # Vectors embeddings
    SearchField(
        name="embed_text",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=vector_search_dim,
        vector_search_configuration="my-vector-config",
    ),


]


# Configuration
vector_search = VectorSearch(
    algorithm_configurations=[
        HnswVectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            # HNSW is a graph-based Approximate Nearest Neighbors (ANN)
            # algorithm optimized for high-recall, low-latency applications
            kind="hnsw",
            parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 500,
                "metric": "cosine",  # Cosine similarity metric
            },
        )
    ]
)

semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="comments"),
        prioritized_keywords_fields=[SemanticField(field_name="month")],
        prioritized_content_fields=[SemanticField(field_name="year")],
    ),
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)

try:
    result = index_client.create_or_update_index(index)
    print(f"Done. The {result.name} Azure Cognitive Search index has been created!")

except:
    print(f"Error. The {result.name} Azure Cognitive Search index cannot be created.")

Done. The surveys Azure Cognitive Search index has been created!


## 7. Uploading the documents into the index

In [31]:
print("Number of documents to load =", len(documents))

Number of documents to load = 1060


In [32]:
def upload_documents(docs):
    """
    Uploading documents into the Azure Cognitive Search index
    Inputs: documents
    Outputs: loading documents to Azure Cognitive Search index
    """
    search_client = SearchClient(
        endpoint=acs_endpoint,
        index_name=index_name,
        credential=AzureKeyCredential(acs_key),
    )
    result = search_client.upload_documents(docs)

In [33]:
def chunk_list(input_list, chunk_size):
    """
    Chunk a list according to the chunk_size value
    Inputs: documents (list), chunk size list
    Outputs: chunk list of documents
    """
    return [
        input_list[i : i + chunk_size] for i in range(0, len(input_list), chunk_size)
    ]

In [34]:
start = time.time()

chunk_size = 100  # We will load documents chunk by chunk
chunks = chunk_list(documents, chunk_size)
idx = 1

print("Loading the documents into the Azure Cognitive Search index...")
print("Total number of documents to load =", len(documents))
print()

loaded_docs = chunk_size

for chunk in chunks:
    pct_done = round(loaded_docs / len(documents) * 100)
    if pct_done >= 100:
        pct_done = 100

    print(
        f"Processing chunk {idx:03}",
        f"| Number of loaded documents = {loaded_docs:06}",
        "of",
        len(documents),
        "| Done:",
        pct_done,
        "%",
    )
    upload_documents(chunk)
    loaded_docs += chunk_size
    idx += 1

elapsed = time.time() - start
print("\nDone")
print(
    "Elapsed time: "
    + time.strftime(
        "%H:%M:%S.{}".format(str(elapsed % 1)[2:])[:15], time.gmtime(elapsed)
    )
)

Loading the documents into the Azure Cognitive Search index...
Total number of documents to load = 1060

Processing chunk 001 | Number of loaded documents = 000100 of 1060 | Done: 9 %
Processing chunk 002 | Number of loaded documents = 000200 of 1060 | Done: 19 %
Processing chunk 003 | Number of loaded documents = 000300 of 1060 | Done: 28 %
Processing chunk 004 | Number of loaded documents = 000400 of 1060 | Done: 38 %
Processing chunk 005 | Number of loaded documents = 000500 of 1060 | Done: 47 %
Processing chunk 006 | Number of loaded documents = 000600 of 1060 | Done: 57 %
Processing chunk 007 | Number of loaded documents = 000700 of 1060 | Done: 66 %
Processing chunk 008 | Number of loaded documents = 000800 of 1060 | Done: 75 %
Processing chunk 009 | Number of loaded documents = 000900 of 1060 | Done: 85 %
Processing chunk 010 | Number of loaded documents = 001000 of 1060 | Done: 94 %
Processing chunk 011 | Number of loaded documents = 001100 of 1060 | Done: 100 %

Done
Elapsed t

In [35]:
print(f"Elapsed time to process {len(documents)} documents = {round(elapsed)} seconds")
print(f"Time per processed document in second = {round(elapsed / len(documents), 5)}")
print(f"Number of processed documents per second = {int(len(documents) / elapsed)}")

Elapsed time to process 1060 documents = 21 seconds
Time per processed document in second = 0.01941
Number of processed documents per second = 51


## 8. Azure Cognitive Search Index informations

In [36]:
index_name

'surveys'

In [37]:
index_status(index_name)

Azure Cognitive Search Index: surveys 

{
     "@odata.context": "https://azurecogsearcheastussr.search.windows.net/$metadata#indexes/$entity",
     "@odata.etag": "\"0x8DBAFC1E451A7AF\"",
     "name": "surveys",
     "defaultScoringProfile": null,
     "fields": [
          {
               "name": "id",
               "type": "Edm.String",
               "searchable": false,
               "filterable": true,
               "retrievable": true,
               "sortable": true,
               "facetable": true,
               "key": true,
               "indexAnalyzer": null,
               "searchAnalyzer": null,
               "analyzer": null,
               "normalizer": null,
               "synonymMaps": []
          },
          {
               "name": "company",
               "type": "Edm.String",
               "searchable": true,
               "filterable": true,
               "retrievable": true,
               "sortable": false,
               "facetable": false,
     

In [38]:
document_count, storage_size = index_stats(index_name)

Azure Cognitive Search index status for: surveys 

{
  "@odata.context": "https://azurecogsearcheastussr.search.windows.net/$metadata#Microsoft.Azure.Search.V2021_04_30_Preview.IndexStatistics",
  "documentCount": 0,
  "storageSize": 0
}


In [39]:
print("Number of documents in the index =", f"{document_count:,}")
print("Size of the index =", round(storage_size / (1024 * 1024), 2), "MB")

Number of documents in the index = 0
Size of the index = 0.0 MB


Note: Please wait some time in order to have the updated results

> Go to the next notebook