# Azure AI Search集成矢量化示例
该代码演示了如何使用Azure AI Search作为向量存储，通过在Azure AI Search的技能集管道中使用AzureOpenAIEmbedding技能自动切块和生成嵌入。
## 先决条件
要运行代码，安装以下软件包。该示例当前使用的版本是`11.4.0b12`，这是一个预发布版本。请注意，集成矢量化功能处于预览状态，尚未发布到[azure-search-documents](https://pypi.org/project/azure-search-documents/#description)在pypi上。如果您想使用这个功能，请参考whl文件。我们希望能尽快发布更新版本!

In [1]:
! pip install ../whl/azure_search_documents-11.4.0b12-py3-none-any.whl --quiet  
! pip install openai azure-storage-blob python-dotenv --quiet

## 导入所需的库和环境变量

In [None]:
!pip install python-dotenv

In [25]:
import dotenv

dotenv.load_dotenv(override=True)

True

In [41]:
# Import required libraries  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient  
import os
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryLanguage,
    QueryType,
    RawVectorQuery,
    VectorizableTextQuery,
    VectorFilterMode,    
)
from azure.search.documents.indexes.models import (  
    AzureOpenAIEmbeddingSkill,  
    AzureOpenAIParameters,  
    AzureOpenAIVectorizer,  
    ExhaustiveKnnParameters,  
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    FieldMapping,  
    HnswParameters,  
    HnswVectorSearchAlgorithmConfiguration,  
    IndexProjectionMode,  
    InputFieldMappingEntry,  
    OutputFieldMappingEntry,  
    PrioritizedFields,    
    SearchField,  
    SearchFieldDataType,  
    SearchIndex,  
    SearchIndexer,  
    SearchIndexerDataContainer,  
    SearchIndexerDataSourceConnection,  
    SearchIndexerIndexProjectionSelector,  
    SearchIndexerIndexProjections,  
    SearchIndexerIndexProjectionsParameters,  
    SearchIndexerSkillset,  
    SemanticConfiguration,  
    SemanticField,  
    SemanticSettings,  
    SplitSkill,  
    VectorSearch,  
    VectorSearchAlgorithmKind,  
    VectorSearchAlgorithmMetric,  
    VectorSearchProfile,  
)  
from azure.storage.blob import BlobServiceClient  
import openai  
from dotenv import load_dotenv  
import os  
  
# Configure environment variables  
load_dotenv()  
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")  
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")  
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")  
model: str = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL")  
blob_connection_string = os.getenv("BLOB_CONNECTION_STRING")  
container_name = os.getenv("BLOB_CONTAINER_NAME")  
credential = AzureKeyCredential(key)  

In [42]:
# load_dotenv(dotenv_path=("../.env"), override=True)  
# index_name
model
# print(os.getenv("AZURE_SEARCH_INDEX_NAME"))

'emb002'

## 连接到Blob存储  
从Blob存储中检索文档。您可以使用[文档](../data/documents)文件夹中的示例文档。

In [5]:
# Connect to Blob Storage
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(container_name)
blobs = container_client.list_blobs()

first_blob = next(blobs)
blob_url = container_client.get_blob_client(first_blob).url
print(f"URL of the first blob: {blob_url}")

URL of the first blob: https://sa4rade.blob.core.windows.net/searchdemo/EmployeeHandbook_Chinese1107.pdf


## 将你的Blob存储连接到Azure AI Search中的数据源

In [31]:
# Create a data source 
ds_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))
container = SearchIndexerDataContainer(name=container_name)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=blob_connection_string,
    container=container
)
data_source = ds_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

Data source 'gptkbindex-demo-blob' created or updated


## 创建搜索索引

In [59]:
# Create a search index  
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)  
fields = [  
    SearchField(name="parent_id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),  
    SearchField(name="title", type=SearchFieldDataType.String, analyzer_name="zh-Hans.lucene"),  
    SearchField(name="chunk_id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True, analyzer_name="keyword"),  
    SearchField(name="chunk", type=SearchFieldDataType.String,  analyzer_name="zh-Hans.lucene", sortable=False, filterable=False, facetable=False),  
    SearchField(name="vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=1536, vector_search_profile="myHnswProfile"),  
]  
  
# Configure the vector search configuration  
vector_search = VectorSearch(  
    algorithms=[  
        HnswVectorSearchAlgorithmConfiguration(  
            name="myHnsw",  
            kind=VectorSearchAlgorithmKind.HNSW,  
            parameters=HnswParameters(  
                m=4,  
                ef_construction=400,  
                ef_search=500,  
                metric=VectorSearchAlgorithmMetric.COSINE,  
            ),  
        ),  
        ExhaustiveKnnVectorSearchAlgorithmConfiguration(  
            name="myExhaustiveKnn",  
            kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,  
            parameters=ExhaustiveKnnParameters(  
                metric=VectorSearchAlgorithmMetric.COSINE,  
            ),  
        ),  
    ],  
    profiles=[  
        VectorSearchProfile(  
            name="myHnswProfile",  
            algorithm="myHnsw",  
            vectorizer="myOpenAI",  
        ),  
        VectorSearchProfile(  
            name="myExhaustiveKnnProfile",  
            algorithm="myExhaustiveKnn",  
            vectorizer="myOpenAI",  
        ),  
    ],  
    vectorizers=[  
        AzureOpenAIVectorizer(  
            name="myOpenAI",  
            kind="azureOpenAI",  
            azure_open_ai_parameters=AzureOpenAIParameters(  
                resource_uri=os.getenv("AZURE_OPENAI_ENDPOINT"),  
                deployment_id=model,  
                api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
            ),  
        ),  
    ],  
)  
  
semantic_config = SemanticConfiguration(  
    name="my-semantic-config",  
    prioritized_fields=PrioritizedFields(  
        prioritized_content_fields=[SemanticField(field_name="chunk")]  
    ),  
)  
  
# Create the semantic settings with the configuration  
semantic_settings = SemanticSettings(configurations=[semantic_config])  
  
# Create the search index with the semantic settings  
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_settings=semantic_settings)  
result = index_client.create_or_update_index(index)  
print(f"{result.name} created")  


gptkbindex-demo created


In [40]:
os.getenv("AZURE_OPENAI_API_KEY")

'fd4b706778944e8ea8775fa7d3ccefe8'

## 创建一套技能

In [60]:
# Create a skillset  
skillset_name = f"{index_name}-skillset"  
  
split_skill = SplitSkill(  
    description="Split skill to chunk documents",  
    text_split_mode="pages",  
    context="/document",  
    maximum_page_length=1024,  
    page_overlap_length=256,  
    default_language_code="zh-Hans",
    inputs=[  
        InputFieldMappingEntry(name="text", source="/document/content"),  
    ],  
    outputs=[  
        OutputFieldMappingEntry(name="textItems", target_name="pages")  
    ],  
)  
  
embedding_skill = AzureOpenAIEmbeddingSkill(  
    description="Skill to generate embeddings via Azure OpenAI",  
    context="/document/pages/*",  
    resource_uri=os.getenv("AZURE_OPENAI_ENDPOINT"),  
    deployment_id=model,  
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    inputs=[  
        InputFieldMappingEntry(name="text", source="/document/pages/*"),  
    ],  
    outputs=[  
        OutputFieldMappingEntry(name="embedding", target_name="vector")  
    ],  
)  
  
index_projections = SearchIndexerIndexProjections(  
    selectors=[  
        SearchIndexerIndexProjectionSelector(  
            target_index_name=index_name,  
            parent_key_field_name="parent_id",  
            source_context="/document/pages/*",  
            mappings=[  
                InputFieldMappingEntry(name="chunk", source="/document/pages/*"),  
                InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),  
                InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),  
            ],  
        ),  
    ],  
    parameters=SearchIndexerIndexProjectionsParameters(  
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS  
    ),  
)  
  
skillset = SearchIndexerSkillset(  
    name=skillset_name,  
    description="Skillset to chunk documents and generating embeddings",  
    skills=[split_skill, embedding_skill],  
    index_projections=index_projections,  
)  
  
client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))  
client.create_or_update_skillset(skillset)  
print(f"{skillset.name} created")  


gptkbindex-demo-skillset created


## 创建一个索引器

In [62]:
# Create an indexer  
indexer_name = f"{index_name}-indexer"  
  
indexer = SearchIndexer(  
    name=indexer_name,  
    description="Indexer to index documents and generate embeddings",  
    skillset_name=skillset_name,  
    target_index_name=index_name,  
    data_source_name=data_source.name,  
    # Map the metadata_storage_name field to the title field in the index to display the PDF title in the search results  
    field_mappings=[FieldMapping(source_field_name="metadata_storage_name", target_field_name="title")]  
)  
  
indexer_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))  
indexer_result = indexer_client.create_or_update_indexer(indexer)  
  
# Run the indexer  
indexer_client.run_indexer(indexer_name)  
print(f' {indexer_name} created')  


 gptkbindex-demo-indexer created


## 执行向量相似性搜索

这个示例展示了使用可向量化文本查询的纯向量搜索，你需要做的就是传入文本，你的向量化程序会处理查询向量化。

In [63]:
# Pure Vector Search
query = "员工有几天带薪年假"  
  
search_client = SearchClient(service_endpoint, index_name, credential=credential)
vector_query = VectorizableTextQuery(text=query, k=1, fields="vector", exhaustive=True)
# Use the below query to pass in the raw vector query instead of the query vectorization
# vector_query = RawVectorQuery(vector=generate_embeddings(query), k=3, fields="vector")
  
results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    select=["parent_id", "chunk_id", "chunk"],
    top=1
)  
  
for result in results:  
    print(f"parent_id: {result['parent_id']}")  
    print(f"chunk_id: {result['chunk_id']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['chunk']}")   


parent_id: aHR0cHM6Ly9zYTRyYWRlLmJsb2IuY29yZS53aW5kb3dzLm5ldC9zZWFyY2hkZW1vL0VtcGxveWVlSGFuZGJvb2tfQ2hpbmVzZTExMDcucGRm0
chunk_id: 75033d34bb1c_aHR0cHM6Ly9zYTRyYWRlLmJsb2IuY29yZS53aW5kb3dzLm5ldC9zZWFyY2hkZW1vL0VtcGxveWVlSGFuZGJvb2tfQ2hpbmVzZTExMDcucGRm0_pages_7
Score: 0.8414016
Content: 54 
社会保险与员工福利 ........................................................................................................................................................ 55 
MS VACATION 系统的基本使用指导 ............................................................................................................................ 63 
如何在系统中提交休假支持材料.................................................................................................................................... 64 
如何提交离职申请............................................................................................................................................................. 65 
管理您的个人数据.............................................................

## 执行混合搜索

In [69]:
# Hybrid Search
query = "带薪年假"  
  
search_client = SearchClient(service_endpoint, index_name, credential=credential)
vector_query = VectorizableTextQuery(text=query, k=1, fields="vector", exhaustive=True)
  
results = search_client.search(  
    search_text=query,  
    vector_queries= [vector_query],
    select=["parent_id", "chunk_id", "chunk"],
    top=5
)  
  
for result in results:  
    print(f"parent_id: {result['parent_id']}")  
    print(f"chunk_id: {result['chunk_id']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['chunk']}")  


parent_id: aHR0cHM6Ly9zYTRyYWRlLmJsb2IuY29yZS53aW5kb3dzLm5ldC9zZWFyY2hkZW1vL0VtcGxveWVlSGFuZGJvb2tfQ2hpbmVzZTExMDcucGRm0
chunk_id: 75033d34bb1c_aHR0cHM6Ly9zYTRyYWRlLmJsb2IuY29yZS53aW5kb3dzLm5ldC9zZWFyY2hkZW1vL0VtcGxveWVlSGFuZGJvb2tfQ2hpbmVzZTExMDcucGRm0_pages_7
Score: 0.0313725508749485
Content: 54 
社会保险与员工福利 ........................................................................................................................................................ 55 
MS VACATION 系统的基本使用指导 ............................................................................................................................ 63 
如何在系统中提交休假支持材料.................................................................................................................................... 64 
如何提交离职申请............................................................................................................................................................. 65 
管理您的个人数据....................................................

## 执行混合搜索 + 语义重新排名

In [67]:
# Semantic Hybrid Search
query = "带薪年假"

search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
vector_query = VectorizableTextQuery(text=query, k=2, fields="vector", exhaustive=True)

results = search_client.search(  
    search_text=query,
    vector_queries=[vector_query],
    select=["parent_id", "chunk_id", "chunk"],
    query_type=QueryType.SEMANTIC, query_language=QueryLanguage.EN_US, semantic_configuration_name='my-semantic-config', query_caption=QueryCaptionType.EXTRACTIVE, query_answer=QueryAnswerType.EXTRACTIVE,
    top=2
)

semantic_answers = results.get_answers()
for answer in semantic_answers:
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"parent_id: {result['parent_id']}")  
    print(f"chunk_id: {result['chunk_id']}")  
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Content: {result['chunk']}")  

    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")


parent_id: aHR0cHM6Ly9zYTRyYWRlLmJsb2IuY29yZS53aW5kb3dzLm5ldC9zZWFyY2hkZW1vL0VtcGxveWVlSGFuZGJvb2tfQ2hpbmVzZTExMDcucGRm0
chunk_id: 75033d34bb1c_aHR0cHM6Ly9zYTRyYWRlLmJsb2IuY29yZS53aW5kb3dzLm5ldC9zZWFyY2hkZW1vL0VtcGxveWVlSGFuZGJvb2tfQ2hpbmVzZTExMDcucGRm0_pages_53
Reranker Score: 2.366842746734619
Content: 个人过去的薪资福利信息等）、绩效考核、升职、调动、加薪、市场调查过程中提供任何虚

假或误导性的信息或故意遗漏重要信息；以及在公司招聘过程中进行的背景调查或反腐败

审查向公司提供不完整或虚假信息，包括但不限于在完成候选人信息披露表时提供不完整

或虚假信息 ；  

3.18. 领导/管理行为 

⚫ 主管和/或其他职能部门人员对善意举报人采取行动报复、恐吓举报人。 

3.19. 违反法律/公共规范 

• 使用公司所有或租赁的车辆过程中有以下严重违反中国《道路交通安全法》的行为： 

1. 无有效驾照而驾驶车辆； 

2. 醉酒驾驶或在酒精或药品（药品类别参照中国《道路交通安全法》确定）的影响下

驾驶； 

3. 使用伪造车牌或有意遮挡车牌； 

4. 发生事故后逃逸。 

• 因违反法律法规而被依法追究刑事责任的； 

• 在公司场所或在工作时间内或工作过程中，或在公司外但与公司相关的场所或公司活动中，

服用毒品、禁药，酗酒或赌博，或者提供毒品或禁药； 

• 组织或煽动停工、怠工或以暴力破坏公司财产等严重影响公司经营秩序的行为； 

• 在公司场所或在工作时间或在履行职务过程中，持有未授权枪支、弹药、国家规定的管制器

械。 

3.20. 其他违反公司价值观或需要加重处罚的严重违纪行为 

• 无正当理由连续旷工三（3）个工作日或以上，或 12 个月内累计旷工五（5）个工作日或以

上；为明确之目的，未按照公司政策和流程申请休假，或无正当理由在未获得休假批准的情

况下，擅自休假的，均视为旷工行为； 



 
 

27 

 微软 Confide