In [None]:
! pip install azure-search-documents==11.6.0b12 --quiet
! pip install azure-identity --quiet
! pip install python-dotenv --quiet
! pip install pandas --quiet

In [2]:
import os
import dotenv
dotenv.load_dotenv()
search_endpoint = os.getenv("SEARCH_ENDPOINT")
admin_key= os.getenv("ADMIN_KEY")
index_name = os.getenv("INDEX_NAME")

In [3]:

from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex, 
    SimpleField, 
    SearchableField,
    ScoringProfile,
    TextWeights
)
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
import pandas as pd


In [4]:

df = pd.read_csv("azure_ai_search_sample_data.csv")
df = df.fillna("")

In [5]:
from azure.search.documents.indexes.models import (
    ScoringProfile, 
    TextWeights
)

index_client = SearchIndexClient(search_endpoint, AzureKeyCredential(admin_key))

fields = [
    SimpleField(name="referenceNumber", type="Edm.String", key=True),
    SearchableField(name="firstName", type="Edm.String", sortable=True, filterable=True),
    SearchableField(name="lastName", type="Edm.String", sortable=True, filterable=True),
    SearchableField(name="dateOfBirth", type="Edm.String", sortable=True, filterable=True),
    SearchableField(name="companyName", type="Edm.String", sortable=True, filterable=True),
    SearchableField(name="suffix", type="Edm.String", sortable=True, filterable=True),
    SearchableField(name="jointAccountNames", type="Edm.String", sortable=True, filterable=True),
]

# Create a scoring profile to boost name and date matches
scoring_profiles = [
    ScoringProfile(
        name="nameBoost",
        text_weights=TextWeights(
            weights={
                "firstName": 3.0,  # Boost firstName matches 3x
                "lastName": 3.0,   # Boost lastName matches 3x
                "dateOfBirth": 1.5,  # Boost date matches 1.5x
                "companyName": 2.0,  # Boost company matches 2x
                "jointAccountNames": 2.0  # Boost joint account matches 2x
            }
        )
    )
]

index = SearchIndex(
    name=index_name, 
    fields=fields,
    scoring_profiles=scoring_profiles,
    default_scoring_profile="nameBoost"  # Apply by default
)

# Delete the index if it exists (for demo repeatability)
try:
    index_client.delete_index(index_name)
except Exception:
    pass

index_client.create_index(index)
print("Index created with scoring profile 'nameBoost' (includes date boosting)")


Index created with scoring profile 'nameBoost' (includes date boosting)


In [6]:

from azure.search.documents import SearchClient

search_client = SearchClient(search_endpoint, index_name, AzureKeyCredential(admin_key))

# Prepare documents for upload
docs = []
for _, row in df.iterrows():
    doc = {
        "referenceNumber": row["Reference Number"] or f"REF{_+1000}",
        "firstName": row["First Name"],
        "lastName": row["Last Name"],
        "dateOfBirth": row["Date of Birth"],
        "companyName": row["Company Name"],
        "suffix": row["Suffix"],
        "jointAccountNames": row["Joint Account Names"],
    }
    docs.append(doc)

# Upload documents
batch_size = 1000
for i in range(0, len(docs), batch_size):
    batch = docs[i:i+batch_size]
    result = search_client.upload_documents(documents=batch)
    print(f"Uploaded batch {i//batch_size + 1}: {result}")


Uploaded batch 1: [<azure.search.documents._generated.models._models_py3.IndexingResult object at 0x0000028143DF2240>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x0000028143DF22A0>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x0000028143DF2270>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x0000028143DF22D0>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x0000028143DF2300>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x0000028143DF2330>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x0000028143DF2360>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x0000028143DF2390>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x0000028143DF23C0>, <azure.search.documents._generated.models._models_py3.IndexingResult object at 0x

## Normalized Score Calculation Summary

### Formula
```python
normalized_score = (raw_score / max_score) * 100

How It Works
Collect all results from Azure Search query
Find max score: Identify the highest raw BM25 score in the result set
Calculate for each result: Divide each raw score by max score, multiply by 100

Example
Query: "Jean-Pierre Dubois" returns:

Result 1: Raw 6.8630 → Normalized 100.00/100 (6.8630/6.8630 × 100)
Result 2: Raw 4.5650 → Normalized 66.52/100 (4.5650/6.8630 × 100)
Result 3: Raw 3.0000 → Normalized 43.71/100 (3.0000/6.8630 × 100)

Why Normalize?
Intuitive: 0-100 scale is easy to understand (100 = best match in this result set)
Consistent: Raw BM25 scores vary by query; normalized scores are always 0-100
Relative: Shows how much better one result is vs. others (90 vs 45 = 2x better)
User-friendly: Business users can interpret scores without understanding BM25

Key Points
⚠️ Scores are relative to current query results only - not comparable across different queries

⚠️ 100/100 means "best in this set", not "perfect match universally"

✅ We display both: Normalized (user-friendly) + Raw (technical validation)

In [7]:

def run_query(scenario_id, scenario_name, query, expected_score_range="", top=5, use_fuzzy=False):
    """
    Run a search query and display results with scoring
    
    Args:
        scenario_id: Scenario identifier (e.g., SV101)
        scenario_name: Description of the scenario
        query: Search query string
        expected_score_range: Expected score range for validation
        top: Number of results to return
        use_fuzzy: Enable fuzzy search for typo tolerance
    """
    print(f"\n{'='*80}")
    print(f"Scenario: {scenario_id} - {scenario_name}")
    print(f"Query: '{query}'")
    if expected_score_range:
        print(f"Expected Score Range: {expected_score_range}")
    if use_fuzzy:
        print(f"Fuzzy Search: Enabled")
    print(f"{'-'*80}")
    
    try:
        # Build search query with fuzzy matching if enabled
        if use_fuzzy:
            # For date queries, extract individual components and search for them
            # This allows partial matches on date components (year, month, day)
            terms = query.replace('.', ' ').split()
            # Search for each non-empty term individually
            search_terms = [term for term in terms if term]
            if search_terms:
                search_query = ' OR '.join(search_terms)
            else:
                search_query = query
        else:
            search_query = query
            
        results = search_client.search(
            search_text=search_query,
            top=top,
            include_total_count=True,
            query_type='full'  # Enable full Lucene query syntax
        )
        
        # Collect all results first to find max score
        results_list = list(results)
        
        if not results_list:
            print("No results found.")
            return
        
        # Find max score for normalization
        max_score = max(result.get('@search.score', 0) for result in results_list)
        
        result_count = 0
        for result in results_list:
            result_count += 1
            raw_score = result.get('@search.score', 0)
            
            # Normalize score to 0-100 range
            normalized_score = (raw_score / max_score * 100) if max_score > 0 else 0
            
            ref = result.get('referenceNumber', 'N/A')
            first = result.get('firstName', '')
            last = result.get('lastName', '')
            dob = result.get('dateOfBirth', '')
            company = result.get('companyName', '')
            suffix = result.get('suffix', '')
            joint = result.get('jointAccountNames', '')
            
            print(f"\nResult {result_count}:")
            print(f"  Normalized Score: {normalized_score:.2f}/100")
            print(f"  Raw Score: {raw_score:.4f}")
            print(f"  Reference: {ref}")
            if first or last:
                print(f"  Name: {first} {last}".strip())
            if dob:
                print(f"  DOB: {dob}")
            if company:
                print(f"  Company: {company} {suffix}".strip())
            if joint:
                print(f"  Joint Account: {joint}")
        
        print(f"\n{'-'*80}")
        print(f"Total results returned: {result_count}")
        print(f"Max raw score: {max_score:.4f}")
            
    except Exception as e:
        print(f"Error executing query: {e}")


In [8]:
# SV101: Exact Match
run_query("SV101", "Exact Match", "John Smith", "100")


Scenario: SV101 - Exact Match
Query: 'John Smith'
Expected Score Range: 100
--------------------------------------------------------------------------------

Result 1:
  Normalized Score: 100.00/100
  Raw Score: 1.7261
  Reference: REF001
Name: John Smith
  DOB: 15.06.1985

Result 2:
  Normalized Score: 94.00/100
  Raw Score: 1.6225
  Reference: REF002
Name: John Michael David Smith
  DOB: 22.09.1978

--------------------------------------------------------------------------------
Total results returned: 2
Max raw score: 1.7261


In [9]:
# SV102: Minor Variation in First Name
run_query("SV102", "Minor Variation in First Name", "Jon Smith", "95-100")


Scenario: SV102 - Minor Variation in First Name
Query: 'Jon Smith'
Expected Score Range: 95-100
--------------------------------------------------------------------------------

Result 1:
  Normalized Score: 100.00/100
  Raw Score: 0.8630
  Reference: REF002
Name: John Michael David Smith
  DOB: 22.09.1978

Result 2:
  Normalized Score: 100.00/100
  Raw Score: 0.8630
  Reference: REF001
Name: John Smith
  DOB: 15.06.1985

--------------------------------------------------------------------------------
Total results returned: 2
Max raw score: 0.8630


In [10]:
# SV103: Minor Typo in Last Name
run_query("SV103", "Minor Typo in Last Name", "John Smit", "85-90")


Scenario: SV103 - Minor Typo in Last Name
Query: 'John Smit'
Expected Score Range: 85-90
--------------------------------------------------------------------------------

Result 1:
  Normalized Score: 100.00/100
  Raw Score: 0.8630
  Reference: REF001
Name: John Smith
  DOB: 15.06.1985

Result 2:
  Normalized Score: 88.00/100
  Raw Score: 0.7595
  Reference: REF002
Name: John Michael David Smith
  DOB: 22.09.1978

--------------------------------------------------------------------------------
Total results returned: 2
Max raw score: 0.8630


In [None]:
# SV104: Transposed First and Last Name
run_query("SV104", "Transposed First and Last Name", "Smith John", "95-100")

In [None]:
# SV105: Missing Last Name
run_query("SV105", "Missing Last Name", "John Michael David", "60-70")

In [None]:
# SV106.A: Complex Name - Partial Match (Missing One Name)
run_query("SV106.A", "Complex Name: Missing One Name", "John David", "60-70")

In [11]:
# SV106.B: Complex Name - Partial Match (Missing Two Names)
run_query("SV106.B", "Complex Name: Missing Two Names", "David", "60-70")


Scenario: SV106.B - Complex Name: Missing Two Names
Query: 'David'
Expected Score Range: 60-70
--------------------------------------------------------------------------------

Result 1:
  Normalized Score: 100.00/100
  Raw Score: 0.7595
  Reference: REF002
Name: John Michael David Smith
  DOB: 22.09.1978

--------------------------------------------------------------------------------
Total results returned: 1
Max raw score: 0.7595


In [None]:
# SV106: Missing First Name
run_query("SV106", "Missing First Name", "Smith", "60-70")

In [12]:
# SV107: Transposed Date Elements
# Search combines name + transposed date format to demonstrate multi-field matching
run_query("SV107", "Transposed Date Elements", "John Smith 06.15.1985", "85-90", use_fuzzy=True)


Scenario: SV107 - Transposed Date Elements
Query: 'John Smith 06.15.1985'
Expected Score Range: 85-90
Fuzzy Search: Enabled
--------------------------------------------------------------------------------

Result 1:
  Normalized Score: 100.00/100
  Raw Score: 1.7261
  Reference: REF001
Name: John Smith
  DOB: 15.06.1985

Result 2:
  Normalized Score: 94.00/100
  Raw Score: 1.6225
  Reference: REF002
Name: John Michael David Smith
  DOB: 22.09.1978

--------------------------------------------------------------------------------
Total results returned: 2
Max raw score: 1.7261


In [None]:
# SV108: Lowercase Name
run_query("SV108", "Lowercase Name", "john smith", "100")

In [None]:
# SV109: Uppercase Name
run_query("SV109", "Uppercase Name", "JOHN SMITH", "100")

In [13]:
# SV110: Extra Punctuation
run_query("SV110", "Extra Punctuation", "John.", "95-100")


Scenario: SV110 - Extra Punctuation
Query: 'John.'
Expected Score Range: 95-100
--------------------------------------------------------------------------------

Result 1:
  Normalized Score: 100.00/100
  Raw Score: 0.8630
  Reference: REF001
Name: John Smith
  DOB: 15.06.1985

Result 2:
  Normalized Score: 88.00/100
  Raw Score: 0.7595
  Reference: REF002
Name: John Michael David Smith
  DOB: 22.09.1978

--------------------------------------------------------------------------------
Total results returned: 2
Max raw score: 0.8630


In [14]:
# SV111.A: Abbreviated First Name
run_query("SV111.A", "Abbreviated First Name", "J. Smith", "85-90")


Scenario: SV111.A - Abbreviated First Name
Query: 'J. Smith'
Expected Score Range: 85-90
--------------------------------------------------------------------------------

Result 1:
  Normalized Score: 100.00/100
  Raw Score: 0.8630
  Reference: REF002
Name: John Michael David Smith
  DOB: 22.09.1978

Result 2:
  Normalized Score: 100.00/100
  Raw Score: 0.8630
  Reference: REF001
Name: John Smith
  DOB: 15.06.1985

--------------------------------------------------------------------------------
Total results returned: 2
Max raw score: 0.8630


In [15]:
# SV111.B: First Name Abbreviation (Initials)
# Searching for just initials with wildcards: J* P* to match Jean* Pierre*
run_query("SV111.B", "First Name Abbreviation", "J* P*", "85-90")


Scenario: SV111.B - First Name Abbreviation
Query: 'J* P*'
Expected Score Range: 85-90
--------------------------------------------------------------------------------

Result 1:
  Normalized Score: 100.00/100
  Raw Score: 6.0000
  Reference: REF003
Name: Jean-Pierre Dubois
  DOB: 06.07.1990

Result 2:
  Normalized Score: 66.67/100
  Raw Score: 4.0000
  Reference: REF203
  Joint Account: Jean-Pierre Dubois & Maria Schmidt

Result 3:
  Normalized Score: 50.00/100
  Raw Score: 3.0000
  Reference: REF002
Name: John Michael David Smith
  DOB: 22.09.1978

Result 4:
  Normalized Score: 50.00/100
  Raw Score: 3.0000
  Reference: REF007
Name: Anna Johnson
  DOB: 17.12.1992

Result 5:
  Normalized Score: 50.00/100
  Raw Score: 3.0000
  Reference: REF001
Name: John Smith
  DOB: 15.06.1985

--------------------------------------------------------------------------------
Total results returned: 5
Max raw score: 6.0000


In [None]:
# SV114: Missing Last Name
run_query("SV114", "Missing Last Name", "John", "65-70")

In [None]:
# SV115: Missing First Name
run_query("SV115", "Missing First Name", "Smith", "65-70")

In [16]:
# SV117: Missing Date of Birth
run_query("SV117", "Missing Date of Birth", "John Smith", "70-75")


Scenario: SV117 - Missing Date of Birth
Query: 'John Smith'
Expected Score Range: 70-75
--------------------------------------------------------------------------------

Result 1:
  Normalized Score: 100.00/100
  Raw Score: 1.7261
  Reference: REF001
Name: John Smith
  DOB: 15.06.1985

Result 2:
  Normalized Score: 94.00/100
  Raw Score: 1.6225
  Reference: REF002
Name: John Michael David Smith
  DOB: 22.09.1978

--------------------------------------------------------------------------------
Total results returned: 2
Max raw score: 1.7261


The key insight: **Date-only searches need exact matching via filters, not text search.** 
e.g., 
# Use OData filter for exact date matching
results = search_client.search(
    search_text="*",  # Match all
    filter="dateOfBirth eq '15.06.1985'",  # Exact date filter
    top=5
)
Fuzzy text search on dates alone creates too much noise without a name to anchor the results.

In [17]:
# SV118: Partial Date (Missing Year)
# Search combines name + partial date (missing year) to find records
run_query("SV118", "Partial Date (Missing Year)", "John Smith 15.06.", "70-80", use_fuzzy=True)


Scenario: SV118 - Partial Date (Missing Year)
Query: 'John Smith 15.06.'
Expected Score Range: 70-80
Fuzzy Search: Enabled
--------------------------------------------------------------------------------

Result 1:
  Normalized Score: 100.00/100
  Raw Score: 1.7261
  Reference: REF001
Name: John Smith
  DOB: 15.06.1985

Result 2:
  Normalized Score: 94.00/100
  Raw Score: 1.6225
  Reference: REF002
Name: John Michael David Smith
  DOB: 22.09.1978

--------------------------------------------------------------------------------
Total results returned: 2
Max raw score: 1.7261


In [None]:
# SV119: Partial Date (Missing Day)
# Search combines name + partial date (missing day) to find records
run_query("SV119", "Partial Date (Missing Day)", "John Smith .06.1985", "75-80", use_fuzzy=True)

In [None]:
# SV120: Partial Date (Missing Month)
# Search combines name + partial date (missing month) to find records
run_query("SV120", "Partial Date (Missing Month)", "John Smith 15..1985", "75-80", use_fuzzy=True)

In [20]:
# SV121: Typo + Missing Umlaut
# Using fuzzy search to match Muller with Müller (umlaut handling)
run_query("SV121", "Typo + Missing Umlaut", "Mueller~", "90-95")


Scenario: SV121 - Typo + Missing Umlaut
Query: 'Mueller~'
Expected Score Range: 90-95
--------------------------------------------------------------------------------

Result 1:
  Normalized Score: 100.00/100
  Raw Score: 0.5754
  Reference: REF004
Name: Hans Müller
  DOB: 12.03.1982

Result 2:
  Normalized Score: 66.67/100
  Raw Score: 0.3836
  Reference: REF204
  Joint Account: Robert Williams & Hans Müller

--------------------------------------------------------------------------------
Total results returned: 2
Max raw score: 0.5754


In [None]:
# SV122: Company Record
run_query("SV122", "Company Record", "TechCorp AG", "100")

In [None]:
# SV123: Missing Suffix
run_query("SV123", "Missing Suffix", "TechCorp", "100")

In [None]:
# SV124: Expanded Suffix
run_query("SV124", "Expanded Suffix", "TechCorp Aktiengesellschaft", "100")

In [21]:
# SV125: Abbreviated Suffix with Punctuation
run_query("SV125", "Abbreviated Suffix with Punctuation", "TechCorp A.G.", "100")


Scenario: SV125 - Abbreviated Suffix with Punctuation
Query: 'TechCorp A.G.'
Expected Score Range: 100
--------------------------------------------------------------------------------

Result 1:
  Normalized Score: 100.00/100
  Raw Score: 0.5754
  Reference: REF101
Company: TechCorp AG

--------------------------------------------------------------------------------
Total results returned: 1
Max raw score: 0.5754


In [None]:
# SV126: Partnership Record
run_query("SV126", "Partnership Record", "Marta Schwarz & Peter Meier", "100")

In [None]:
# SV127: Abbreviated Partnership Name
run_query("SV127", "Abbreviated Partnership Name", "M Schwarz & P Meier", "85-90")

In [None]:
# SV128: Partial Partnership Name
run_query("SV128", "Partial Partnership Name", "Marta Schwarz und Meier", "80-85")

In [None]:
# SV129: Joint Account Name Transposition
run_query("SV129", "Joint Account Name Transposition", "Peter Meier und Marta Schwarz", "90-95")

In [None]:
# SV130: Name Appears in Joint Account
run_query("SV130", "Name Appears in Joint Account", "Peter Meier", "75-80")