In [1]:
import requests
import json
from openai import AzureOpenAI
import os

"""
IN CMD:

setx OPENAI_KEY_ABB ""
setx OPENAI_BASE_ABB ""

setx AISEARCH_KEY_ABB ""
setx AISEARCH_BASE_ABB ""

setx DOCUMENT_INTELLIGENCE_KEY_ABB ""
setx DOCUMENT_INTELLIGENCE_BASE_ABB ""
"""

# --- Azure OpenAI: API version plus key/endpoint read from the environment ---
openai_api_version = "2024-03-01-preview"
openai_key = os.getenv("OPENAI_KEY_ABB")
openai_endpoint = os.getenv("OPENAI_BASE_ABB")

# --- Search service: target index name plus key/endpoint ---
index_name = "ai_workshop"
aisearch_key = os.getenv("AISEARCH_KEY_ABB")
aisearch_endpoint = os.getenv("AISEARCH_BASE_ABB")

# --- Document Intelligence key/endpoint ---
document_intelligence_key = os.getenv("DOCUMENT_INTELLIGENCE_KEY_ABB")
document_intelligence_base = os.getenv("DOCUMENT_INTELLIGENCE_BASE_ABB")

# Any variable that is not set comes back as None here instead of raising,
# so a missing setting only surfaces where the value is first used.
client = AzureOpenAI(
    azure_endpoint=openai_endpoint,
    api_key=openai_key,
    api_version=openai_api_version,
)

### Load data

In [2]:
import csv

# Mapping of heading (first CSV column) -> content (second CSV column).
# A heading that appears more than once keeps the content of its last row.
heading_content_dict = {}

# Define CSV file path
csv_file_path = "heading_content.csv"

# Read data from CSV file
with open(csv_file_path, "r", newline="", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)

    # Skip header row; the None default keeps an empty file from raising
    # StopIteration.
    next(reader, None)

    # Read data rows
    for row in reader:
        # csv.reader yields an empty list for a blank line; nothing to store.
        if not row:
            continue

        heading = row[0]
        # A row with a heading but no second column is kept with empty
        # content instead of aborting the whole load with IndexError.
        content = row[1] if len(row) > 1 else ""
        heading_content_dict[heading] = content

# Print or process the rebuilt dictionary
for heading, content in heading_content_dict.items():
    print("Heading:", heading)
    print("Content:", content)
    print()

Heading: 1\. Lecture Information
Content: 
| Lecture Topics | Lecture Duration: 2.30 Hours |
| - | - |
| 1) Introduction to pumps types | Parts Demonstration: 30 Minutes |
| 2) Assembly parts of pumps | |
| 3) Pump operation | |
| 4) Preventive maintenance of pumps | |
| 5) Troubleshooting of pumps | |
| 6) Selection criteria of pumps | |

Liquids are typically moved by pumps. These use work to increase the mechanical energy of a fluid, which in turn can increase the flow rate (velocity), pressure, or elevation of the fluid.



Heading: Types of Pumps:
Content: 
There are two main categories of pumps -- positive displacement and centrifugal. The choice is based on the liquid to be pumped and the desired head and capacity.

Centrifugal pumps are probably most common in industrial applications. They may be built in a very large number of materials. Capacity ranges up to 6000 gpm are common, as are heads to 600 feet, all without special drivers. Performance drops off significantly when ha

### Creating the index

In [3]:
# Request headers and query parameters shared by the search service REST calls
headers = {'Content-Type': 'application/json','api-key': aisearch_key}
params = {'api-version': "2023-10-01-Preview"}

# Index definition: vector search setup, one semantic configuration and the
# field schema. Field attributes are real booleans so that json.dumps emits
# JSON true/false rather than the strings "true"/"false".
index_payload = {
    "name": index_name,
    "vectorSearch": {
        "algorithms": [  # Two algorithm configurations: HNSW and exhaustive KNN
            {
                "name": "my-hnsw-config",
                "kind": "hnsw",
                "hnswParameters": {
                    "m": 4,
                    "efConstruction": 400,
                    "efSearch": 500,
                    "metric": "cosine"
                }
            },
            {
                "name": "my-eknn-config",
                "kind": "exhaustiveKnn",
                "exhaustiveKnnParameters": {
                    "metric": "cosine"
                }
            }
        ],
        "vectorizers": [
            {
                "name": "openai",
                "kind": "azureOpenAI",
                "azureOpenAIParameters": {
                    "resourceUri": openai_endpoint,
                    "apiKey": openai_key,
                    "deploymentId": "text-embedding-ada-002"
                }
            }
        ],
        "profiles": [  # Each profile pairs one algorithm with one vectorizer
            {
                "name": "my-vector-profile-hnsw",
                "algorithm": "my-hnsw-config",
                "vectorizer": "openai"
            },
            {
                "name": "my-vector-profile-eknn",
                "algorithm": "my-eknn-config",
                "vectorizer": "openai"
            }
        ]
    },
    "semantic": {
        "configurations": [
            {
                "name": "my-semantic-config",
                "prioritizedFields": {
                    "titleField": {
                        "fieldName": "title"
                    },
                    "prioritizedContentFields": [
                        {
                            "fieldName": "content"
                        }
                    ],
                    "prioritizedKeywordsFields": [
                        {
                            "fieldName": "paragraph"
                        }
                    ]
                }
            }
        ]
    },
    "fields": [
        {"name": "id", "type": "Edm.String", "key": True, "filterable": True},
        {"name": "title", "type": "Edm.String", "searchable": True, "retrievable": True, "filterable": False, "sortable": False, "facetable": False},
        {"name": "paragraph", "type": "Edm.String", "searchable": True, "retrievable": True, "filterable": True, "sortable": False, "facetable": False},
        {"name": "content", "type": "Edm.String", "searchable": True, "retrievable": True, "filterable": False, "sortable": False, "facetable": False},
        {
            "name": "contentVector",
            "type": "Collection(Edm.Single)",
            "dimensions": 1536,
            # This field is bound to the HNSW profile; use
            # "my-vector-profile-eknn" instead for exhaustive KNN.
            "vectorSearchProfile": "my-vector-profile-hnsw",
            "searchable": True,
            "retrievable": True,
            "filterable": False,
            "sortable": False,
            "facetable": False
        }
    ],
}

# PUT the definition to /indexes/<index_name> on the search endpoint
r = requests.put(aisearch_endpoint + "/indexes/" + index_name,
                 data=json.dumps(index_payload), headers=headers, params=params)
print(r.status_code)
print(r.ok)
if not r.ok:
    # The response body carries the service's error details; the status code
    # alone does not say what was rejected.
    print(r.text)

201
True


### Pushing content into index

In [5]:
import base64

def get_embedding(content, model="text-embedding-ada-002"):
    """Return the embedding vector for ``content``.

    Calls ``client.embeddings.create`` on the module-level ``client`` and
    returns the ``embedding`` attribute of the first item in the response.

    Args:
        content: Value passed as ``input`` to ``client.embeddings.create``.
        model: Value passed as ``model``. Defaults to
            ``"text-embedding-ada-002"``, the name that was previously
            hard-coded, so existing single-argument calls behave the same.

    Returns:
        ``response.data[0].embedding`` from the embeddings response.
    """
    response = client.embeddings.create(input=content, model=model)
    return response.data[0].embedding

# Upload one document per heading. A failure is reported together with the
# heading it belongs to, and the loop moves on to the next heading.
for heading, content in heading_content_dict.items():
    try:
        document = {
            # The id is derived from the heading alone.
            # TODO: revisit when paragraphs are split further, since several
            # chunks under one heading would then share the same id.
            "id": base64.urlsafe_b64encode(heading.encode("utf-8")).decode(),
            "title": "machinery manual sample",
            "paragraph": heading,
            "content": content,
            # Empty content is embedded as a placeholder string instead.
            "contentVector": get_embedding(content if content != "" else "-------"),
            "@search.action": "upload"
        }
        upload_payload = {"value": [document]}

        r = requests.post(aisearch_endpoint + "/indexes/" + index_name + "/docs/index",
                          data=json.dumps(upload_payload), headers=headers, params=params)

        if r.status_code != 200:
            print(f"Upload failed for heading {heading!r}: HTTP {r.status_code}")
            print(r.text)

    except Exception as e:
        # Best-effort upload: report which heading failed and carry on.
        print(f"Exception while processing heading {heading!r}:", e)