In [1]:
import logging
import sys

# Send log records to stdout so they appear in the notebook's cell output.
# basicConfig() already attaches a stdout StreamHandler to the root logger;
# the extra addHandler(StreamHandler(stdout)) that used to follow it caused
# every record to be printed twice, so it has been removed.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)

In [2]:
import sys, os

# Report the interpreter path so it is clear which environment receives the packages.
print("Installing packages into environment {}".format(sys.executable))

Installing packages into environment c:\anaconda3\envs\woodsidepoc\python.exe


In [3]:
!{sys.executable} -m pip install azure-search-documents==11.4.0b8
!{sys.executable} -m pip install azure-identity
!{sys.executable} -m pip install python-dotenv



In [4]:
from dotenv import load_dotenv
import os

# Load environment variables from the .env file. The call is the cell's last
# expression, so the notebook displays its return value.
load_dotenv()

True

In [5]:
from azure.core.credentials import AzureKeyCredential  
# Configure the connection to the search service.
# Both settings are required, so they are read with os.environ[...] rather than
# os.getenv(...): a missing variable now fails right here with a KeyError that
# names it, instead of handing None to the SDK and failing somewhere less obvious.
service_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
key_credential = AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"])

In [6]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexerClient
# Build the management clients. Both talk to the same service endpoint with the
# same admin-key credential; indexer_client is the one the skillset cells use
# to publish skillsets.
search_client_kwargs = {"endpoint": service_endpoint, "credential": key_credential}

index_client = SearchIndexClient(**search_client_kwargs)
indexer_client = SearchIndexerClient(**search_client_kwargs)

In [7]:
from azure.search.documents.indexes.models import ( 
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    SearchIndex,
    BM25Similarity,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    #VectorSearchAlgorithmKind,
    HnswParameters,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SemanticSettings,
    #VectorSearchProfile
    OcrSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    MergeSkill,
    SplitSkill,
    ShaperSkill,
    SearchIndexerSkillset,
    CognitiveServicesAccountKey,
    SearchIndexerKnowledgeStore,
    SearchIndexerKnowledgeStoreProjection,
    SearchIndexerKnowledgeStoreObjectProjectionSelector,
    EntityRecognitionSkill,
    KeyPhraseExtractionSkill   
    
)

In [8]:
# Cognitive Services account that is attached to the skillsets defined in this notebook.
cognitive_services_account = CognitiveServicesAccountKey(
    # Descriptive label only; left as os.getenv so an unset variable simply yields None.
    # NOTE(review): assumes the SDK model treats description as optional - confirm.
    description=os.getenv("AZURE_COGNITIVE_ACCOUNT_RESOURCE"),
    # The key is mandatory: os.environ[...] raises a KeyError naming the variable
    # when it is missing, instead of silently building the account with key=None.
    key=os.environ["AZURE_COGNITIVE_ACCOUNT_KEY"],
)

In [None]:
# Skill "#1": OCR. Runs once per entry under /document/normalized_images and
# publishes the recognized text as "text" in that same context.
ocr_skill = OcrSkill(
    name="#1",
    description="Extracts text (plain and structured) from image.",
    context="/document/normalized_images/*",
    inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
    outputs=[OutputFieldMappingEntry(name="text", target_name="text")],
    default_language_code="en",
    should_detect_orientation=True,
    line_ending="Space",
)

# Skill "#2": merge. Takes the document content plus the OCR text and content
# offsets of each normalized image, and publishes the result as "mergedText".
# Inserted image text is wrapped in "--IMG--" tags on both sides.
# NOTE(review): the description string mentions "merged_content", but the output
# is actually published under the name "mergedText".
merge_skill = MergeSkill(
    name="#2",
    description="Create merged_content, which includes all the textual representation of each image inserted at the right location in the content field.",
    context="/document",
    inputs=[
        InputFieldMappingEntry(source="/document/content", name="text"),
        InputFieldMappingEntry(source="/document/normalized_images/*/text", name="itemsToInsert"),
        InputFieldMappingEntry(source="/document/normalized_images/*/contentOffset", name="offsets"),
    ],
    outputs=[OutputFieldMappingEntry(name="mergedText", target_name="mergedText")],
    insert_pre_tag="--IMG--",
    insert_post_tag="--IMG--",
)


# Skill "#3": split. Reads /document/mergedText and publishes the resulting
# pieces as "textItems", using "pages" mode with a maximum page length of 4000.
split_skill = SplitSkill(
    name="#3",
    context="/document",
    inputs=[InputFieldMappingEntry(name="text", source="/document/mergedText")],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="textItems")],
    text_split_mode="pages",
    maximum_page_length=4000,
    default_language_code="en",
)

# Field name -> enrichment path for every attribute of one shaped text item.
# Insertion order is kept so the nested inputs are emitted in the same order.
shaped_page_sources = {
    "metadata_spo_item_content_type": "/document/metadata_storage_content_type",
    "metadata_spo_item_last_modified": "/document/metadata_storage_last_modified",
    "metadata_spo_item_size": "/document/metadata_storage_size",
    "metadata_spo_item_name": "/document/metadata_storage_name",
    "metadata_spo_item_weburi": "/document/metadata_storage_path",
    "page": "/document/textItems/*",
}

# Skill "#4": shaper. For each entry under /document/textItems it bundles the
# item together with the document's storage metadata, and publishes the result
# as "ShaperOutput".
shaper_skill = ShaperSkill(
    name="#4",
    context="/document",
    inputs=[
        InputFieldMappingEntry(
            name="textItem",
            source_context="/document/textItems/*",
            inputs=[
                InputFieldMappingEntry(name=field_name, source=source_path)
                for field_name, source_path in shaped_page_sources.items()
            ],
        )
    ],
    outputs=[OutputFieldMappingEntry(name="output", target_name="ShaperOutput")],
)


In [None]:
# Knowledge store definition: a single object projection that writes
# /document/ShaperOutput (exposed as "pages") to the "getest2" storage container.
knowledge_store = SearchIndexerKnowledgeStore(
    # The connection string is required, so it is read with os.environ[...]:
    # a missing variable raises a KeyError naming it here, rather than building
    # a knowledge store with storage_connection_string=None.
    storage_connection_string=os.environ["AZURE_KNOWLEDGE_STORE_RESOURCE"],
    identity=None,
    projections=[
        SearchIndexerKnowledgeStoreProjection(
            # Only object projections are used; tables and files stay empty.
            tables=[],
            objects=[
                SearchIndexerKnowledgeStoreObjectProjectionSelector(
                    storage_container="getest2",
                    reference_key_name=None,
                    generated_key_name="export",
                    source=None,
                    source_context="/document",
                    inputs=[
                        InputFieldMappingEntry(
                            name="pages",
                            source="/document/ShaperOutput",
                            source_context=None,
                            inputs=[],
                        ),
                    ],
                )
            ],
            files=[],
        )
    ],
)



In [None]:
# First skillset: OCR -> merge -> split -> shape, with the shaped output
# projected through the knowledge store.
skillset_name = "skillset-sharepoint-demo-index-51"
description = "Skillset to split the document into pages and project to Azure Storage."

skillset_sharepoint_demo_index_51 = SearchIndexerSkillset(
    name=skillset_name,
    description=description,
    skills=[
        ocr_skill,
        merge_skill,
        split_skill,
        shaper_skill,
    ],
    cognitive_services_account=cognitive_services_account,
    knowledge_store=knowledge_store,
)



In [None]:
# Publish the skillset to the search service.
# create_or_update_skillset() is used instead of create_skillset() so this cell
# can be re-run (e.g. Restart Kernel -> Run All) when a skillset with this name
# already exists; an existing definition is replaced with the one built above.
indexer_client.create_or_update_skillset(skillset=skillset_sharepoint_demo_index_51)

# skillset-sharepoint-demo-index-53

In [24]:
# Skill "#1": entity recognition. Reads /document/page and publishes the
# organizations, persons and locations it finds.
entity_recognition_skill = EntityRecognitionSkill(
    name="#1",
    description="Recognize organizations, persons and locations",
    # Fixed from the misspelled "/documentz/pagez". The context now matches the
    # skill's own input path and the context used by the key phrase skill, so
    # the outputs are attached to a node the skill actually reads from.
    context="/document/page",
    categories=["Organization", "Person", "Location"],
    default_language_code="en",
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/page")
    ],
    outputs=[
        OutputFieldMappingEntry(name="organizations", target_name="organizations"),
        OutputFieldMappingEntry(name="persons", target_name="persons"),
        OutputFieldMappingEntry(name="locations", target_name="locations")
    ]
)

# Skill "#2": key phrase extraction. Reads /document/page as the text and
# /document/language as the language code, and publishes "keyPhrases".
key_phrase_extraction_skill = KeyPhraseExtractionSkill(
    name="#2",
    context="/document/page",
    inputs=[
        InputFieldMappingEntry(source="/document/page", name="text"),
        InputFieldMappingEntry(source="/document/language", name="languageCode"),
    ],
    outputs=[OutputFieldMappingEntry(name="keyPhrases", target_name="keyPhrases")],
    default_language_code="en",
)



In [25]:
# Second skillset: entity recognition followed by key phrase extraction.
# It has no knowledge store. Note that skillset_name and description are
# rebound here.
skillset_name = "skillset-sharepoint-demo-index-53"
description = "Built-in AI skills."

skillset_sharepoint_demo_index_53 = SearchIndexerSkillset(
    name=skillset_name,
    description=description,
    skills=[
        entity_recognition_skill,
        key_phrase_extraction_skill,
    ],
    cognitive_services_account=cognitive_services_account,
)

In [26]:
# Publish the skillset to the search service.
# create_or_update_skillset() is used instead of create_skillset() so this cell
# can be re-run (e.g. Restart Kernel -> Run All) when a skillset with this name
# already exists; an existing definition is replaced with the one built above.
# Left as the last expression so the returned skillset is shown as the cell output.
indexer_client.create_or_update_skillset(skillset=skillset_sharepoint_demo_index_53)

<azure.search.documents.indexes.models._models.SearchIndexerSkillset at 0x296b7a60a10>