In [27]:
import re
from typing import Dict, Union, Tuple, Optional

def extract_numeric_constraints(
    query: str,
    inr_per_million_usd: float = 8_300_000,
) -> Dict[str, Dict[str, Union[int, float]]]:
    """Extract numeric filter constraints from a free-text startup query.

    The query is lower-cased and scanned for comparator phrases ("over",
    "at least", ">", "under", "at most", "<", "between ... and ...")
    followed by an amount. Each amount is converted to INR and attached to
    the field whose keyword lies closest to the start of the phrase.
    Head-counts ("more than 50 employees") and founding years
    ("after 2015") are handled by dedicated patterns.

    Args:
        query: Free-text query, e.g. "funding between ₹10 crore and
            ₹100 crore, revenue less than $20 mil".
        inr_per_million_usd: INR value of one million USD, used to convert
            USD amounts. The default reproduces the constant this function
            has always used.

    Returns:
        Mapping of field name to a dict with "gte" and/or "lte" bounds.
        Monetary bounds are floats in INR rounded to 2 decimals; employee
        and year bounds found by their dedicated patterns are ints. An
        empty dict is returned when nothing is recognised.

    Notes:
        * ">" is reported as "gte" and "<" as "lte" (no strict bounds).
        * A USD amount without a unit is assumed to be in millions.
        * An amount with neither a currency symbol nor code is taken as INR.
    """
    # NOTE(review): the default rate implies 8.3 INR per USD, which looks
    # roughly 10x lower than recent market rates -- confirm and pass the
    # correct value via `inr_per_million_usd` if so.

    constraints: Dict[str, Dict[str, Union[int, float]]] = {}
    query_lower = query.lower()

    default_field = "total_funding_raised_inr"
    employee_field = "number_of_employees_current"
    max_keyword_distance = 50  # characters between keyword and phrase start

    field_keywords = {
        "total_funding_raised_inr": ["funding", "investment", "raised", "capital", "secured"],
        "valuation_estimate_if_available": ["valuation", "worth", "company value"],
        "revenue_estimate_annual": ["revenue", "income", "sales", "turnover"],
        "number_of_employees_current": ["employees", "staff", "team size", "headcount"],
        "year_founded": ["founded", "established", "startup year", "launch year"],
    }

    # --- Regex building blocks -------------------------------------------
    # A number must contain at least one digit, so stray punctuation such as
    # the comma in "more than, say" can never reach float()/int().
    number = r"(\d+(?:,\d+)*(?:\.\d+)?|\.\d+)"
    count = r"(\d+(?:,\d+)*)"
    # The trailing \b stops single-letter units from matching the first
    # letter of the next word ("50 companies" is not 50 crore).
    unit = r"(?:(crores?|cr|c|millions?|mil|m|billions?|bil|b|lakhs?|l)\b)?"
    # Four capturing groups: symbol/code prefix, value, unit, code suffix.
    amount = rf"\s*(₹|\$|usd|inr)?\s*{number}\s*{unit}\s*(?:(usd|inr)\b)?"
    emp_noun = r"(?:employees|staff|team size|headcount)"

    # Comparator phrases, kept in their historical order because a later
    # match overwrites an earlier one for the same field/operator. The
    # leading \b keeps "over" from matching inside "turnover"; the symbol
    # forms carry no \b so that "revenue > 5 crore" (space before ">") works.
    comparators = [
        (r"\b(?:over|above|more than|greater than)", "gte"),
        (r"\b(?:min(?:imum)?|at least)", "gte"),
        (r"(?<![-=])>=?", "gte"),  # lookbehind skips "->" and "=>" arrows
        (r"\b(?:under|below|less than|smaller than)", "lte"),
        (r"\b(?:max(?:imum)?|at most|up ?to)", "lte"),
        (r"<=?", "lte"),
    ]

    unit_aliases = {
        "c": "crore", "cr": "crore", "crore": "crore", "crores": "crore",
        "l": "lakh", "lakh": "lakh", "lakhs": "lakh",
        "m": "million", "mil": "million", "million": "million", "millions": "million",
        "b": "billion", "bil": "billion", "billion": "billion", "billions": "billion",
    }
    # Multipliers to plain rupees for INR amounts.
    inr_multiplier = {
        "lakh": 100_000,
        "million": 1_000_000,
        "crore": 10_000_000,
        "billion": 1_000_000_000,
    }
    # Multipliers to *millions* for USD amounts (the rate is per million).
    usd_millions = {"lakh": 0.1, "million": 1, "crore": 10, "billion": 1000}

    def explicit_currency(symbol: Optional[str], code: Optional[str]) -> Optional[str]:
        """Return "USD"/"INR" if either token names a currency, else None."""
        for token in (symbol, code):
            if token in ("$", "usd"):
                return "USD"
            if token in ("₹", "inr"):
                return "INR"
        return None

    def to_inr(value: str, unit_token: Optional[str], currency: str) -> float:
        """Convert a matched amount to INR, rounded to 2 decimals."""
        num = float(value.replace(",", ""))
        canonical = unit_aliases.get(unit_token or "")
        if currency == "USD":
            # No unit on a USD amount: assume the figure is in millions.
            num = num * usd_millions.get(canonical, 1) * inr_per_million_usd
        elif canonical is not None:
            num = num * inr_multiplier[canonical]
        return round(num, 2)

    # Keyword positions do not depend on the match, so scan the query once.
    keyword_hits = [
        (hit.start(), field)
        for field, keywords in field_keywords.items()
        for keyword in keywords
        for hit in re.finditer(re.escape(keyword), query_lower)
    ]
    followed_by_emp_noun = re.compile(rf"\s*{emp_noun}")

    def field_for(match) -> str:
        """Pick the field a matched amount belongs to."""
        # A number directly followed by an employee noun is a head-count,
        # whatever other keyword happens to be nearby.
        if followed_by_emp_noun.match(query_lower, match.end()):
            return employee_field
        best_field = None
        best_distance = float("inf")
        for position, field in keyword_hits:
            distance = abs(position - match.start())
            # Strict "<" keeps the first keyword found on ties.
            if distance <= max_keyword_distance and distance < best_distance:
                best_distance = distance
                best_field = field
        return best_field or default_field

    # --- Funding / revenue / valuation: single bound ----------------------
    for op_regex, operator in comparators:
        for match in re.finditer(op_regex + amount, query_lower):
            symbol, value, unit_token, code = match.groups()
            currency = explicit_currency(symbol, code) or "INR"
            constraints.setdefault(field_for(match), {})[operator] = to_inr(
                value, unit_token, currency
            )

    # --- Funding / revenue / valuation: range -----------------------------
    for match in re.finditer(rf"\bbetween{amount}\s*and{amount}", query_lower):
        s1, v1, u1, c1, s2, v2, u2, c2 = match.groups()
        cur1 = explicit_currency(s1, c1)
        cur2 = explicit_currency(s2, c2)
        # "between 10 and 100 crore": a bound given without its own unit or
        # currency borrows them from the other bound.
        low = to_inr(v1, u1 or u2, cur1 or cur2 or "INR")
        high = to_inr(v2, u2 or u1, cur2 or cur1 or "INR")
        constraints.setdefault(field_for(match), {}).update({"gte": low, "lte": high})

    # --- Employees (plain integers) ---------------------------------------
    # Runs after the monetary pass so the integer head-count wins over any
    # float written for the same phrase above.
    for op_regex, operator in comparators:
        for match in re.finditer(rf"{op_regex}\s*{count}\s*{emp_noun}", query_lower):
            head_count = int(match.group(1).replace(",", ""))
            constraints.setdefault(employee_field, {})[operator] = head_count

    # The noun after the first number is optional ("between 50 and 200 staff").
    emp_range = rf"\bbetween\s*{count}\s*(?:{emp_noun}\s*)?and\s*{count}\s*{emp_noun}"
    for match in re.finditer(emp_range, query_lower):
        low_count = int(match.group(1).replace(",", ""))
        high_count = int(match.group(2).replace(",", ""))
        constraints.setdefault(employee_field, {}).update(
            {"gte": low_count, "lte": high_count}
        )

    # --- Founding year (first occurrence of each direction only) ----------
    year_after = re.search(r"\b(?:after|since)\s+(\d{4})\b", query_lower)
    if year_after:
        constraints.setdefault("year_founded", {})["gte"] = int(year_after.group(1))

    year_before = re.search(r"\b(?:before|until|prior to)\s+(\d{4})\b", query_lower)
    if year_before:
        constraints.setdefault("year_founded", {})["lte"] = int(year_before.group(1))

    return constraints


In [29]:
# ✅ Example usage:
if __name__ == "__main__":
    # Demo query combining an INR funding range, a USD revenue ceiling and an
    # employee floor. The leading/trailing blank lines and indentation are
    # spelled out explicitly so the value is the same as a pasted
    # triple-quoted block.
    query = (
        "\n"
        "\n"
        '   "List startups with funding between ₹10 crore and ₹100 crore, '
        'revenue less than $20 mil, and above 50 employees."\n'
        "    \n"
        "    "
    )
    print(extract_numeric_constraints(query))

{'number_of_employees_current': {'gte': 50}, 'revenue_estimate_annual': {'lte': 166000000.0}, 'total_funding_raised_inr': {'gte': 100000000.0, 'lte': 1000000000.0}}
