Step 3.1 – Accept and Normalize User Input, focusing on:

Cleaning input

Lowercasing

Removing extra whitespace

Removing unnecessary punctuation
(without expanding abbreviations like PED)



In [1]:
import re

def normalize_user_query(query: str) -> str:
    """
    Return a cleaned, lowercased version of a raw user query.

    Abbreviations (e.g. "PED", "NCD") are lowercased along with everything
    else but are never expanded here.

    Args:
        query (str): Raw user input.

    Returns:
        str: The query in lowercase, with punctuation other than hyphens,
        slashes and parentheses removed, and whitespace collapsed to single
        spaces with no leading/trailing space.
    """
    lowered = query.lower()

    # Keep only word characters, whitespace, "-", "/", "(" and ")".
    # Everything else is deleted outright (not replaced by a space).
    without_punctuation = re.sub(r"[^\w\s\-\/()]", "", lowered)

    # Squeeze every whitespace run down to one space and trim the ends.
    return re.sub(r"\s+", " ", without_punctuation).strip()


# Example: run a sample policy question through the normalizer.
raw_query = "What is the grace period for premium payment under the National Parivar Mediclaim Plus Policy?"
normalized_query = normalize_user_query(raw_query)

print(f"Normalized Query: {normalized_query}")


Normalized Query: what is the grace period for premium payment under the national parivar mediclaim plus policy



Step 3.2 – Query Classification & Intent Recognition


In [5]:
import re
from typing import Dict, Tuple

# Step 3.1 — Normalization function (same as before)
def normalize_user_query(query: str) -> str:
    """Lowercase a raw query, strip most punctuation and collapse whitespace.

    Hyphens, slashes and parentheses are kept; abbreviations are lowercased
    but not expanded.
    """
    lowered = query.lower()
    # Delete every character that is not a word char, whitespace, "-", "/", "(" or ")".
    without_punctuation = re.sub(r"[^\w\s\-\/()]", "", lowered)
    # Collapse whitespace runs to one space and trim both ends.
    return re.sub(r"\s+", " ", without_punctuation).strip()

# Step 3.2 — Category keywords
# Maps each query category label to the lowercase keywords/phrases that
# signal it. Insertion order matters: classify_query() walks the categories
# in this order and stops at the first one with a matching keyword, so an
# earlier category wins when a query contains keywords from several
# (e.g. "what is the waiting period ..." is classified as "Waiting Period",
# not "Definition").
CATEGORY_KEYWORDS = {
    "Coverage": ["cover", "coverage", "included", "include", "insured", "covered"],
    "Waiting Period": ["waiting period", "how long", "after how many", "time before", "initial waiting"],
    "Eligibility": ["eligibility", "who is eligible", "criteria", "conditions to apply"],
    "Limits": ["limit", "maximum", "cap", "restricted", "sub-limit", "restriction"],
    "Definition": ["define", "definition", "what is"],
    "Discounts": ["no claim", "ncd", "discount", "bonus"],
    "Benefits": ["benefit", "advantage", "reward", "perk", "preventive"],
    "Hospitalization": ["hospital", "icu", "room rent", "admission"],
    "AYUSH": ["ayurveda", "homeopathy", "ayush", "unani", "naturopathy", "siddha"],
    "Maternity": ["maternity", "childbirth", "pregnancy", "delivery", "termination"],
}

# Step 3.2 — Classification function
def classify_query(query: str, category_keywords: Dict[str, list]) -> Tuple[str, list]:
    """
    Assign a query to the first category that has a matching keyword.

    Categories are scanned in the mapping's iteration order and, within a
    category, keywords are tried in list order. A keyword matches when it
    occurs in the query as a whole word/phrase, case-insensitively.

    Args:
        query (str): Query text to classify.
        category_keywords (Dict[str, list]): Category label -> keywords.

    Returns:
        Tuple[str, list]: The matched category label and a list holding the
        keyword that triggered it, or ("Unknown", []) when nothing matches.
    """
    hits = []

    for label, terms in category_keywords.items():
        # Only the first matching keyword of a category is recorded.
        first_hit = next(
            (
                term
                for term in terms
                if re.search(rf"\b{re.escape(term)}\b", query, re.IGNORECASE)
            ),
            None,
        )
        if first_hit is None:
            continue

        hits.append(first_hit)
        # A category literally labelled "Unknown" is indistinguishable from
        # "no match yet", so scanning carries on past it.
        if label != "Unknown":
            return label, hits

    return "Unknown", hits

# Example: normalize a sample question, then classify it.
raw_query = "What is the waiting period for cataract surgery?"
normalized = normalize_user_query(raw_query)
category, entities = classify_query(normalized, CATEGORY_KEYWORDS)

print(f"Normalized Query: {normalized}")
print(f"Query Category: {category}")
print(f"Matched Keywords: {entities}")


Normalized Query: what is the waiting period for cataract surgery
Query Category: Waiting Period
Matched Keywords: ['waiting period']


Step 3.3 – Canonicalization & Synonym Mapping



In [6]:
import re

# Step 3.3 — Canonical mapping dictionary
# Keys are lowercase colloquial phrases or abbreviations as they may appear in
# a normalized query; values are the canonical terms they are rewritten to by
# canonicalize_query(). Multi-word phrases are listed before the shorter
# phrases they contain (e.g. "no claim bonus" before "bonus").
SYNONYM_MAP = {
    "eye operation": "cataract surgery",
    "eye surgery": "cataract surgery",
    "no claim bonus": "no claim discount",
    "bonus": "no claim discount",
    "preventive checkup": "preventive health check-up",
    "checkup": "health check-up",
    "pregnancy cost": "maternity expenses",
    "delivery cost": "maternity expenses",
    "hospital definition": "hospital",
    "room charges": "room rent",
    "icu charges": "icu charges",  # identity mapping (no-op): the term is already canonical
    "ncd": "no claim discount",
    "ped": "pre-existing disease"
}

def canonicalize_query(query: str, synonym_map: dict) -> str:
    """
    Replace terms in the query with their canonical equivalents using a mapping.

    All phrases are substituted in a single left-to-right pass, so:

    * the result does not depend on the order of entries in ``synonym_map``
      (longer phrases always take precedence over shorter ones they contain);
    * text produced by one replacement is never rewritten by another entry;
    * canonical values are inserted literally, even if they contain
      backslashes.

    Matching is case-insensitive and restricted to whole words/phrases.

    Args:
        query (str): Query text, typically already normalized.
        synonym_map (dict): Phrase -> canonical replacement. Empty-string
            keys are ignored.

    Returns:
        str: The query with every matched phrase replaced. The input is
        returned unchanged when the mapping has no usable phrases.
    """
    # Longest phrases first so that e.g. "no claim bonus" is preferred over
    # "bonus" regardless of dict order. sorted() is stable, so equal-length
    # phrases keep their original relative order.
    phrases = sorted((p for p in synonym_map if p), key=len, reverse=True)
    if not phrases:
        # An empty alternation would match at every word boundary.
        return query

    # One capturing group per phrase: the group that matched identifies the
    # phrase without having to re-normalize the matched text's case.
    alternation = "|".join(f"({re.escape(p)})" for p in phrases)
    pattern = re.compile(rf"\b(?:{alternation})\b", re.IGNORECASE)

    replacements = [synonym_map[p] for p in phrases]

    # A function replacement inserts the canonical text verbatim; a plain
    # string would be parsed as a template with backslash escapes.
    return pattern.sub(lambda m: replacements[m.lastindex - 1], query)


# Example: normalize a sample question, then map its terms to canonical form.
raw_query = "Is there any bonus if I don’t make a claim?"
from_step3_1 = normalize_user_query(raw_query)
canonical_query = canonicalize_query(from_step3_1, SYNONYM_MAP)

print(f"Original Query: {raw_query}")
print(f"Normalized: {from_step3_1}")
print(f"Canonicalized: {canonical_query}")


Original Query: Is there any bonus if I don’t make a claim?
Normalized: is there any bonus if i dont make a claim
Canonicalized: is there any no claim discount if i dont make a claim


In [10]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the embedding model once at module level; get_query_embedding() below
# reuses this instance on every call.
# Per the original note, this is meant to be the same model that was used in
# Phase 2 — swap in your selected model if that differs.
model = SentenceTransformer('BAAI/bge-base-en-v1.5')  # or your selected model

def get_query_embedding(canonical_query: str) -> np.ndarray:
    """
    Encode a preprocessed query with the module-level embedding ``model``.

    The encoder is called with ``normalize_embeddings=True``.

    Args:
        canonical_query (str): Query string after normalization and
            canonicalization.

    Returns:
        np.ndarray: The vector returned by ``model.encode`` for this query,
        intended for semantic search.
    """
    return model.encode(canonical_query, normalize_embeddings=True)


# Example: full preprocessing pipeline (normalize -> canonicalize -> embed).
raw_query = "Does the plan include eye operation?"
normalized = normalize_user_query(raw_query)
canonical = canonicalize_query(normalized, SYNONYM_MAP)
query_vector = get_query_embedding(canonical)

print(f"Final Query for Vector Search: {canonical}")
# Preview only the leading components of the embedding.
print("Embedding Vector (first 5 values):", query_vector[:5])







Final Query for Vector Search: does the plan include cataract surgery
Embedding Vector (first 5 values): [-0.02212147  0.00739667  0.00522298 -0.01729682  0.05961149]
