# In [2]:
# -*- coding: utf-8 -*-

import re
import time
import json
import hashlib
import numpy as np
import spacy
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from spacy.matcher import DependencyMatcher
from typing import List, Dict, Any, Set

# ==========================================
# 1. USER PROVIDED VALIDATORS (STRICT LOGIC)
# ==========================================
class Validators:
    """Stateless format and checksum checks for regex-detected PII candidates.

    Every method is a ``@staticmethod`` that receives the raw matched text and
    returns ``True`` when the candidate is plausible for its category.
    """

    # Extensions rejected by ``validate_url`` when they sit where a top-level
    # domain would be (e.g. "report.pdf").
    IGNORED_FILE_EXTENSIONS = {
        'pdf', 'jpg', 'png', 'gif', 'doc', 'docx', 'xls', 'xlsx',
        'txt', 'zip', 'rar', 'exe', 'mp3', 'mp4', 'json', 'xml', 'js', 'py'
    }

    # Common non-PII words that match ID patterns (Rubbish filter)
    # Expanded with false positives from BIC analysis
    GERMAN_RUBBISH = {
        "kannst", "helfen", "beantworten", "verstopft", "freundlich", "formulieren",
        "morgen", "zwischen", "kommen", "folgene", "reingekommen", "antwort",
        "geehrte", "herren", "mitarbeiter", "leider", "melden", "nettes",
        "brauche", "vorlage", "wohnhaft", "heisst", "nachricht", "dringend",
        "stunde", "vertrag", "tabelle", "eintrag", "arbeitge", "adresse",
        "rechnung", "folgende", "bestimmt", "ordentliche", "einfache",
        "lastschrift", "gesamtsumme", "erfragen", "kopieren", "kowalski",
        "belasten", "ueberweisen", "kreditkarte", "erweiterung", "einzuziehen",
        "bergmann", "erteilen", "wechselt", "vorbereiten", "lieferanten", "geschrieben"
    }

    # Top-level domains accepted by ``validate_url`` for scheme-less URLs.
    SAFE_TLDS = {
        'com', 'net', 'org', 'info', 'biz', 'co', 'io', 'me', 'edu', 'gov', 'int', 'mil',
        'de', 'at', 'ch', 'eu', 'nl', 'fr', 'uk', 'be', 'dk', 'no', 'se', 'fi', 'pl', 'it', 'es',
        'app', 'dev', 'ai', 'cloud', 'tech', 'digital', 'studio', 'online', 'shop', 'store',
        'berlin', 'hamburg', 'koeln', 'bayern'
    }

    # ISO 3166-1 alpha-2 country codes for BIC validation.
    # Duplicated entries were removed and the missing "TO" (Tonga) was added.
    ISO_COUNTRY_CODES = {
        "AD", "AE", "AF", "AG", "AI", "AL", "AM", "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ",
        "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS",
        "BT", "BV", "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG", "CH", "CI", "CK", "CL", "CM", "CN",
        "CO", "CR", "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE",
        "EG", "EH", "ER", "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR", "GA", "GB", "GD", "GE", "GF",
        "GG", "GH", "GI", "GL", "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU", "GW", "GY", "HK", "HM",
        "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS", "IT", "JE", "JM",
        "JO", "JP", "KE", "KG", "KH", "KI", "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA", "LB", "LC",
        "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK",
        "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS", "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA",
        "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP", "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG",
        "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT", "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW",
        "SA", "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ", "SK", "SL", "SM", "SN", "SO", "SR", "SS",
        "ST", "SV", "SX", "SY", "SZ", "TC", "TD", "TF", "TG", "TH", "TJ", "TK", "TL", "TM", "TN", "TO",
        "TR", "TT", "TV", "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ", "VA", "VC", "VE", "VG", "VI",
        "VN", "VU", "WF", "WS", "YE", "YT", "ZA", "ZM", "ZW"
    }

    # Lower-case context words; not referenced by any method of this class.
    # The literal previously repeated five of its lines verbatim.
    LOCATION_CONTEXT_WORDS = {
        "anschrift", "adresse", "wohnt", "wohnen", "wohnhaft", "ging", "an", "der", "die", "das",
        "in", "im", "er", "sie", "es", "wir", "ihr", "ist", "sind", "war", "waren", "heißt",
        "heist", "objekt", "einsatzort", "beim", "kunden", "eltern", "seinen", "ihren",
        "meine", "unser", "unsere", "nämlich", "namens", "name", "herr", "frau",
        "auftraggeber", "rechnung", "rechnungsadresse", "standort", "baustelle",
        "dr.", "prof.", "arzt", "ärztin", "anwalt", "familie"
    }

    # Single-pass translation used by ``fix_common_typos`` (O->0, I->1, S->5).
    _TYPO_TABLE = str.maketrans("OIS", "015")

    @staticmethod
    def normalize(text: str) -> str:
        """Return *text* without whitespace, dashes, dots, slashes,
        backslashes and parentheses."""
        return re.sub(r"[\s\-\./\(\)\\]", "", text)

    @staticmethod
    def fix_common_typos(text: str) -> str:
        """Upper-case *text* and map the look-alike letters O, I, S to 0, 1, 5.

        Intended for financial strings whose body should be numeric.
        """
        return text.upper().translate(Validators._TYPO_TABLE)

    # --- PHONE VALIDATION ---
    @staticmethod
    def validate_phone(text: str) -> bool:
        """Return ``True`` when *text* looks like a phone number.

        Rejects date ranges, dates, month/year pairs, IPv4-like strings,
        near-constant digit runs, and anything whose cleaned length is
        outside 7..15 characters.
        """
        # Date range such as "01.02.-03.04" and plain dates such as "12.05.2023".
        if re.search(r"\d{2}\.\d{2}\.\-\d{2}\.\d{2}", text):
            return False
        if re.search(r"\d{1,2}[\.\/]\d{1,2}[\.\/]\d{2,4}", text):
            return False

        # The junk tokens must be removed BEFORE the o->0 / l->1 substitution;
        # afterwards they read "0na1" / "ab0rte1" and would never match.
        clean = re.sub(r"onal|abortel", "", text.lower())
        clean = clean.replace('o', '0').replace('l', '1')
        clean = re.sub(r"[\s\-\./\(\)\\]", "", clean)

        if len(clean) < 7 or len(clean) > 15:
            return False
        # Eight characters with a dot or slash: likely "MM.YYYY" / "YYYY/MM".
        if len(clean) == 8 and ('.' in text or '/' in text):
            if re.search(r"[01]\d[\.\/][12]\d{3}", text) or re.search(r"[12]\d{3}[\.\/][01]\d", text):
                return False
        # Long runs made of at most two distinct characters (e.g. "0000000000").
        if len(set(clean)) <= 2 and len(clean) > 9:
            return False
        # Dotted quad (IPv4-like).
        if re.search(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", text):
            return False
        return True

    # --- EMAIL VALIDATION ---
    @staticmethod
    def validate_email(text: str) -> bool:
        """Return ``True`` when *text* has a local part, an ``@`` and a domain
        whose last label is 2..8 characters long.

        The domain itself must contain a dot, so a dot in the local part
        ("a.bc@d") is no longer mistaken for a top-level domain.
        """
        if "@" not in text:
            return False
        local, _, domain = text.rpartition("@")
        if not local or "." not in domain:
            return False
        tld = domain.rsplit(".", 1)[-1]
        return 2 <= len(tld) <= 8

    # --- URL VALIDATION ---
    @staticmethod
    def validate_url(text: str) -> bool:
        """Return ``True`` when *text* looks like a URL.

        Anything containing ``@`` is rejected. Strings starting with
        ``http:``, ``https:`` or ``www.`` are accepted outright. Otherwise one
        of the last two host labels must be in ``SAFE_TLDS`` and the final
        label must not be numeric, shorter than two characters, or a known
        file extension.
        """
        if "@" in text:
            return False
        if text.lower().startswith(('http:', 'https:', 'www.')):
            return True
        labels = text.split('/')[0].split('.')
        if len(labels) < 2:
            return False
        # The last two labels are checked so that "co.uk"-style suffixes pass.
        if not any(label.lower() in Validators.SAFE_TLDS for label in labels[-2:]):
            return False
        last = labels[-1].lower()
        return not (len(last) < 2 or last.isdigit() or last in Validators.IGNORED_FILE_EXTENSIONS)

    # --- IBAN VALIDATION ---
    @staticmethod
    def validate_iban(text: str) -> bool:
        """Return ``True`` when *text* passes the IBAN mod-97 check.

        The normalized, upper-cased text is checked first. If that fails, the
        check is retried after ``fix_common_typos`` has been applied to
        everything after the two-letter country code. The country code is
        left alone because the typo fix only turns letters into digits and
        would destroy codes such as "ES" or "IS".
        """
        def check_sum(clean_text: str) -> bool:
            # Strings containing "ZZZ" are rejected outright.
            if "ZZZ" in clean_text.upper():
                return False
            if len(clean_text) < 15 or len(clean_text) > 34:
                return False
            # Only ASCII letters and digits take part in the checksum; this
            # also guarantees the int() conversions below cannot raise.
            if not (clean_text.isascii() and clean_text.isalnum()):
                return False
            rearranged = clean_text[4:] + clean_text[:4]
            # Base-36 digit value: "0".."9" -> 0..9, "A".."Z" -> 10..35.
            numeric_iban = "".join(str(int(char, 36)) for char in rearranged)
            return int(numeric_iban) % 97 == 1

        clean = Validators.normalize(text).upper()
        if check_sum(clean):
            return True
        fixed = clean[:2] + Validators.fix_common_typos(clean[2:])
        return check_sum(fixed)

    # --- CARD VALIDATION ---
    @staticmethod
    def validate_card(text: str) -> bool:
        """Return ``True`` when *text* is 13..19 ASCII digits, does not start
        with 0, and passes the Luhn checksum."""
        clean = Validators.normalize(text)
        # isascii() keeps characters like "²" (isdigit() is True, int() fails) out.
        if not (clean.isascii() and clean.isdigit()):
            return False
        if not (13 <= len(clean) <= 19) or clean.startswith('0'):
            return False
        checksum = 0
        for position, char in enumerate(reversed(clean)):
            digit = int(char)
            if position % 2 == 1:
                # Every second digit from the right is doubled; two-digit
                # results are reduced by 9.
                doubled = digit * 2
                checksum += doubled if doubled < 10 else doubled - 9
            else:
                checksum += digit
        return checksum % 10 == 0

    @staticmethod
    def validate_id(text: str, id_type: str) -> bool:
        """Return ``True`` when *text* is structurally plausible for *id_type*.

        *id_type* is one of the ``"PII:ID:*"`` labels or
        ``"PII:FINANCIAL:BIC"``; unknown labels are accepted once the rubbish
        filter has passed.
        """
        clean = Validators.normalize(text)

        # --- RUBBISH FILTER ---
        if clean.lower() in Validators.GERMAN_RUBBISH:
            return False

        if id_type == "PII:ID:TAX":
            clean_tax = clean.upper()
            if clean_tax.startswith("DE"):
                body = clean_tax[2:]
                return len(body) == 9 and body.isdigit()
            return 10 <= len(clean_tax) <= 13 and clean_tax.isdigit()

        if id_type == "PII:ID:UST":
            clean_ust = clean.upper()
            if not clean_ust.startswith("DE"):
                return False
            body = clean_ust[2:]
            return len(body) == 9 and body.isdigit()

        if id_type == "PII:ID:INSURANCE":
            return len(clean) == 10 and clean[0].isalpha() and clean[1:].isdigit()

        if id_type == "PII:ID:SVN":
            return 9 <= len(clean) <= 15

        if id_type == "PII:ID:DRIVERLICENSE":
            return 9 <= len(clean) <= 11 and any(c.isdigit() for c in clean) and any(c.isalpha() for c in clean)

        if id_type == "PII:ID:PASSPORT":
            if clean.isalpha():
                return False
            return 6 <= len(clean) <= 12

        if id_type == "PII:ID:NATIONAL":
            return 6 <= len(clean) <= 12

        if id_type == "PII:FINANCIAL:BIC":
            clean_bic = clean.upper()
            if len(clean_bic) not in [8, 11]:
                return False
            # Common German word endings that happen to have BIC length.
            if clean.lower().endswith(('schrift', 'summe', 'bereit', 'ieren', 'ung')):
                return False
            if not clean_bic[:4].isalpha():
                return False
            country_code = clean_bic[4:6]
            if country_code not in Validators.ISO_COUNTRY_CODES:
                return False
            # A capitalised all-letter word is treated as an ordinary word.
            if text[0].isupper() and text[1:].islower() and text.isalpha():
                return False
            return True

        return True

# ==========================================
# 2. CONTEXT VALIDATORS (AI FALLBACK)
# ==========================================
class IBANContextValidator:
    """Context check for IBAN candidates based on character n-gram TF-IDF.

    The text surrounding a candidate is compared against the positive anchor
    phrases; the negative anchors only take part in fitting the vectorizer.
    """

    def __init__(self, threshold=0.18):
        self.threshold = threshold
        self.pos_anchors = [
            "IBAN", "Konto", "Bankverbindung", "Überweisung", "SEPA",
            "BIC", "Bank", "Kontodaten", "Zahlung an",
        ]
        self.neg_anchors = [
            "Telefon", "Handy", "Fax", "ID", "Pass", "Ausweis", "Steuer", "SV-Nr",
        ]
        corpus = self.pos_anchors + self.neg_anchors
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(corpus)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)

    def is_valid_context(self, text: str, start: int, end: int) -> bool:
        """Return ``True`` when the best positive-anchor similarity of the
        40-character neighbourhood of ``text[start:end]`` exceeds the threshold."""
        lo = max(0, start - 40)
        hi = min(len(text), end + 40)
        context = text[lo:hi].lower()
        features = self.vectorizer.transform([context])
        similarities = cosine_similarity(features, self.pos_vectors)
        best_positive = float(np.max(similarities))
        return best_positive > self.threshold

class CardContextValidator:
    """Context check for payment-card candidates based on character n-gram
    TF-IDF similarity to positive and negative anchor phrases."""

    def __init__(self, threshold=0.25):
        self.threshold = threshold
        self.pos_anchors = [
            "Kreditkarte", "Mastercard", "Visa", "Amex", "American Express",
            "Karteninhaber", "Gültigkeit", "endend auf", "ending in",
            "Ablaufdatum", "Zahlung", "Karte", "Credit Card", "Girocard",
            "EC-Karte",
        ]
        self.neg_anchors = [
            "Geburtstag", "Telefon", "Hausnummer", "PLZ", "Postleitzahl",
            "Jahr", "Uhrzeit", "Euro", "EUR", "PIN", "CVV", "CVC",
            "Prüfziffer", "Code", "TAN", "IBAN", "Bic", "Tel", "Fax",
        ]
        corpus = self.pos_anchors + self.neg_anchors
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(corpus)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)
        self.neg_vectors = self.vectorizer.transform(self.neg_anchors)

    def is_valid_context(self, text: str, start: int, end: int) -> bool:
        """Return ``True`` when the 60-character neighbourhood of the candidate
        is closer to a positive anchor than to any negative one and the
        positive similarity exceeds the threshold."""
        lo = max(0, start - 60)
        hi = min(len(text), end + 60)
        context = text[lo:hi].lower()
        features = self.vectorizer.transform([context])
        best_positive = float(np.max(cosine_similarity(features, self.pos_vectors)))
        best_negative = float(np.max(cosine_similarity(features, self.neg_vectors)))
        return best_positive > self.threshold and best_positive > best_negative

class PhoneContextValidator:
    """Context check for phone-number candidates.

    Keyword and mobile-prefix shortcuts are tried first; otherwise the
    surrounding text is scored against TF-IDF anchor vectors.
    """

    def __init__(self, threshold=0.08):
        self.threshold = threshold
        self.pos_anchors = [
            "tel", "telefon", "phone", "mobil", "handy", "fon", "nummer", "nr",
            "rückruf", "contact", "kontakt", "anrufen", "angerufen",
            "durchwahl", "mobile", "erreichbar", "unter", "ansprechpartner",
        ]
        self.neg_anchors = [
            "laufzeit", "zeitraum", "datum", "iban", "bic", "steuer", "id",
            "konto", "bank", "betrag", "euro", "eur", "plz", "gesamtsumme",
        ]
        corpus = self.pos_anchors + self.neg_anchors
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(corpus)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)
        self.neg_vectors = self.vectorizer.transform(self.neg_anchors)

    def is_valid_context(self, text: str, start: int, end: int) -> bool:
        """Return ``True`` when ``text[start:end]`` appears in a phone context."""
        lo = max(0, start - 60)
        hi = min(len(text), end + 60)
        context = text[lo:hi].lower()

        # Shortcut 1: an explicit phone keyword occurs somewhere in the window.
        for keyword in ("tel", "mobil", "handy", "fon", "nummer", "nr."):
            if keyword in context:
                return True

        # Shortcut 2: 10-13 digits starting with "01".
        digits_only = re.sub(r"[^0-9]", "", text[start:end])
        if digits_only.startswith("01") and 10 <= len(digits_only) <= 13:
            return True

        features = self.vectorizer.transform([context])
        best_positive = float(np.max(cosine_similarity(features, self.pos_vectors)))
        best_negative = float(np.max(cosine_similarity(features, self.neg_vectors)))
        beats_negative = best_positive >= best_negative or best_positive > 0.20
        return best_positive > self.threshold and beats_negative

class PassportContextValidator:
    """Context check for passport-number candidates using a keyword shortcut
    and TF-IDF similarity to positive/negative anchor phrases."""

    def __init__(self, threshold=0.18):
        self.threshold = threshold
        self.pos_anchors = [
            "Pass", "Reisepass", "Passport", "Passnummer", "Pass-Nr",
            "Pass No", "Visa", "Nationalität", "Dokument",
        ]
        self.neg_anchors = [
            "IBAN", "BIC", "Konto", "Bank", "Euro", "Zahlung", "Lastschrift",
            "Telefon", "Handy", "Tel", "Mobil", "Nummer", "Rückruf",
            "Jahre", "alt", "geboren", "geb.", "Geburtsdatum",
            "Führerschein", "License", "FS-Nr", "Klasse", "Fahrer",
            "Versicherung", "SV-Nr", "Krankenkasse", "Steuer-ID",
        ]
        corpus = self.pos_anchors + self.neg_anchors
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(corpus)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)
        self.neg_vectors = self.vectorizer.transform(self.neg_anchors)

    def is_valid_context(self, text: str, start: int, end: int, candidate: str) -> bool:
        """Return ``True`` when the candidate sits in a passport context.

        A "pass" or "reise" substring in the 25 characters before the
        candidate is accepted immediately. Otherwise the positive similarity
        must beat the negative one and the threshold, which is raised by 50 %
        for all-digit candidates.
        """
        preceding = text[max(0, start - 25):start].lower()
        if "pass" in preceding or "reise" in preceding:
            return True

        lo = max(0, start - 60)
        hi = min(len(text), end + 60)
        features = self.vectorizer.transform([text[lo:hi].lower()])
        best_positive = float(np.max(cosine_similarity(features, self.pos_vectors)))
        best_negative = float(np.max(cosine_similarity(features, self.neg_vectors)))

        required = self.threshold * 1.5 if candidate.isdigit() else self.threshold
        return best_positive > required and best_positive > best_negative

class SVNContextValidator:
    """Context check for social-insurance-number candidates.

    The negative anchors only take part in fitting the vectorizer; scoring
    uses the positive anchors alone.
    """

    def __init__(self, threshold=0.15):
        self.threshold = threshold
        self.pos_anchors = [
            "SV-Nummer", "SV-Nr", "Sozialversicherung", "Rentenversicherung",
            "Versicherungsnummer", "RV-NR", "SVNR",
        ]
        self.neg_anchors = [
            "IBAN", "Konto", "Pass", "Telefon", "Handy", "Steuer", "Tax",
        ]
        corpus = self.pos_anchors + self.neg_anchors
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(corpus)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)

    def is_valid_context(self, text: str, start: int, end: int) -> bool:
        """Return ``True`` when an explicit keyword occurs within 50 characters
        of the candidate or the positive similarity exceeds the threshold."""
        lo = max(0, start - 50)
        hi = min(len(text), end + 50)
        context = text[lo:hi].lower()

        for keyword in ("sv-nr", "svnr", "rv-nr", "sozialversicherung"):
            if keyword in context:
                return True

        features = self.vectorizer.transform([context])
        best_positive = float(np.max(cosine_similarity(features, self.pos_vectors)))
        return best_positive > self.threshold

class TaxContextValidator:
    """Context check for tax-ID candidates using a keyword shortcut and
    TF-IDF similarity to positive/negative anchor phrases."""

    def __init__(self, threshold=0.20):
        self.threshold = threshold
        self.pos_anchors = [
            "Steuer-ID", "Steuernummer", "St-ID", "USt-IdNr", "Finanzamt",
            "Einkommensteuer", "Steuererklärung", "Steuer-Nr",
            "Identifikationsnummer", "Steueridentifikationsnummer", "Steuernr",
        ]
        self.neg_anchors = [
            "Telefon", "Handy", "Tel", "Mobil", "Fax", "IBAN", "BIC", "Konto",
            "Bank", "Pass", "Ausweis", "SV-Nr", "Sozialversicherung",
            "Krankenkasse",
        ]

        corpus = self.pos_anchors + self.neg_anchors
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(corpus)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)
        self.neg_vectors = self.vectorizer.transform(self.neg_anchors)

    def is_valid_context(self, text: str, start: int, end: int, candidate: str) -> bool:
        """Return ``True`` when the candidate sits in a tax-ID context.

        A tax keyword in the 35 characters before the candidate is accepted
        immediately. Otherwise the 70-character neighbourhood is scored; a
        candidate that is all digits after normalization needs 1.5x the
        threshold.
        """
        preceding = text[max(0, start - 35):start].lower()
        for keyword in ("steuer", "id-nr", "ust-idnr", "st-nr", "identifikationsnummer"):
            if keyword in preceding:
                return True

        lo = max(0, start - 70)
        hi = min(len(text), end + 70)
        features = self.vectorizer.transform([text[lo:hi].lower()])
        best_positive = float(np.max(cosine_similarity(features, self.pos_vectors)))
        best_negative = float(np.max(cosine_similarity(features, self.neg_vectors)))

        numeric_only = Validators.normalize(candidate).isdigit()
        required = self.threshold * 1.5 if numeric_only else self.threshold
        return best_positive > required and best_positive > best_negative

class BICContextValidator:
    """Context check for BIC candidates using a keyword shortcut and TF-IDF
    similarity to positive/negative anchor phrases."""

    def __init__(self, threshold=0.40):
        self.threshold = threshold
        self.pos_anchors = [
            "BIC", "SWIFT", "Bank-Code", "Bankverbindung", "Überweisung",
            "Bankdaten", "Kreditinstitut", "Zahlungsempfänger",
        ]
        self.neg_anchors = [
            "Thorsten", "Monika", "Detlef", "rechnung", "folgende", "bestimmt",
            "muss", "neue", "Frau", "Herr", "Mitarbeiter", "Kunde", "Name",
            "Telefon", "Handy", "Adresse", "Abwasser", "Kaminski", "Hartmann",
            "Lastschrift", "Gesamtsumme", "erfragen", "kopieren", "Kowalski",
            "belasten", "Kreditkarte", "Erweiterung", "Bergmann", "wechselt",
        ]

        corpus = self.pos_anchors + self.neg_anchors
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(corpus)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)
        self.neg_vectors = self.vectorizer.transform(self.neg_anchors)

    def is_valid_context(self, text: str, start: int, end: int, candidate: str) -> bool:
        """Return ``True`` when the candidate sits in a BIC context.

        "bic", "swift" or "bank-code" within 50 characters is accepted
        immediately. Otherwise the positive similarity must beat the negative
        one; candidates written like a capitalised word need a score above
        0.65 instead of the configured threshold.
        """
        lo = max(0, start - 50)
        hi = min(len(text), end + 50)
        context = text[lo:hi].lower()

        for keyword in ("bic", "swift", "bank-code"):
            if keyword in context:
                return True

        features = self.vectorizer.transform([context])
        best_positive = float(np.max(cosine_similarity(features, self.pos_vectors)))
        best_negative = float(np.max(cosine_similarity(features, self.neg_vectors)))

        looks_like_word = candidate[0].isupper() and candidate[1:].islower()
        required = 0.65 if looks_like_word else self.threshold
        return best_positive > required and best_positive > best_negative

class NationalContextValidator:
    """Context check for national-ID candidates using a keyword shortcut and
    TF-IDF similarity to positive/negative anchor phrases."""

    def __init__(self, threshold=0.18):
        self.threshold = threshold
        self.pos_anchors = [
            "Ausweis", "Personalausweis", "National ID", "ID-Nr",
            "Identitätskarte", "Ausweisnummer", "Perso", "Dokumentnummer",
        ]
        self.neg_anchors = [
            "Führerschein", "License", "Driver", "Klasse", "Fahrerlaubnis",
            "Fahrzeug", "IBAN", "Konto", "Telefon", "SV-Nr",
        ]

        corpus = self.pos_anchors + self.neg_anchors
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(corpus)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)
        self.neg_vectors = self.vectorizer.transform(self.neg_anchors)

    def is_valid_context(self, text: str, start: int, end: int, candidate: str) -> bool:
        """Return ``True`` when the candidate sits in a national-ID context.

        "ausweis", "perso" or "id-nr" in the 25 characters before the
        candidate is accepted immediately. Otherwise the 60-character
        neighbourhood is scored; all-digit candidates need 1.5x the threshold.
        """
        preceding = text[max(0, start - 25):start].lower()
        for keyword in ("ausweis", "perso", "id-nr"):
            if keyword in preceding:
                return True

        lo = max(0, start - 60)
        hi = min(len(text), end + 60)
        features = self.vectorizer.transform([text[lo:hi].lower()])
        best_positive = float(np.max(cosine_similarity(features, self.pos_vectors)))
        best_negative = float(np.max(cosine_similarity(features, self.neg_vectors)))

        required = self.threshold * 1.5 if candidate.isdigit() else self.threshold
        return best_positive > required and best_positive > best_negative

# ==========================================
# ADDED CONTEXT VALIDATORS FROM CODE 2
# ==========================================
# The class header used to read ``(ContextValidatorBase := object)``, which
# silently bound this module-level name as a side effect. The binding is kept
# as an explicit alias in case other code references it.
ContextValidatorBase = object


class InsuranceContextValidator(ContextValidatorBase):
    """Context check for health-insurance-number candidates based on TF-IDF
    similarity to positive and negative anchor phrases."""

    def __init__(self, threshold=0.35):
        self.threshold = threshold
        self.pos_anchors = [
            "KV-Nr", "Krankenkasse", "Versicherung", "Mitgliedsnummer",
            "Gesundheitskarte", "AOK", "Techniker", "Barmer", "DAK",
            "Versichertennummer", "IKK", "HUK", "Signal Iduna", "Allianz",
        ]
        self.neg_anchors = [
            "USt-IdNr", "Umsatzsteuer", "IBAN", "Steuernummer", "Führerschein",
            "Bankverbindung", "BIC", "SWIFT",
        ]
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(self.pos_anchors + self.neg_anchors)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)
        self.neg_vectors = self.vectorizer.transform(self.neg_anchors)

    def is_valid_context(self, text, start, end):
        """Return ``True`` when the 60-character neighbourhood of
        ``text[start:end]`` is closer to a positive anchor than to any
        negative one and the positive similarity exceeds the threshold."""
        window = text[max(0, start - 60):min(len(text), end + 60)].lower()
        input_vec = self.vectorizer.transform([window])
        pos_score = float(np.max(cosine_similarity(input_vec, self.pos_vectors)))
        neg_score = float(np.max(cosine_similarity(input_vec, self.neg_vectors)))
        return pos_score > self.threshold and pos_score > neg_score

class DriverLicenseContextValidator:
    """Context check for driving-licence-number candidates based on TF-IDF
    similarity to positive and negative anchor phrases."""

    def __init__(self, threshold=0.40):
        self.threshold = threshold
        self.pos_anchors = [
            "Führerschein", "Klasse", "FS-Nr", "Fahrerlaubnis", "beantragt",
            "ausgestellt", "Fahrerkarte", "Pappe", "Listen-Nr",
            "Fahrberechtigung",
        ]
        self.neg_anchors = [
            "USt-IdNr", "DE", "Bank", "IBAN", "Steuer", "Umsatzsteuer",
            "Krankenkasse", "Versichertennr", "Mitgliedsnummer", "Reisepass",
            "Pass", "Passport", "Personalausweis", "Ausweis", "Identitätskarte",
        ]
        corpus = self.pos_anchors + self.neg_anchors
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(corpus)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)
        self.neg_vectors = self.vectorizer.transform(self.neg_anchors)

    def is_valid_context(self, text, start, end):
        """Return ``True`` when the 60-character neighbourhood of the candidate
        scores above the threshold and above the best negative anchor."""
        lo = max(0, start - 60)
        hi = min(len(text), end + 60)
        features = self.vectorizer.transform([text[lo:hi].lower()])
        best_positive = float(np.max(cosine_similarity(features, self.pos_vectors)))
        best_negative = float(np.max(cosine_similarity(features, self.neg_vectors)))
        return best_positive > self.threshold and best_positive > best_negative

class DriverPlateContextValidator:
    """Context check for licence-plate candidates based on TF-IDF similarity
    to positive and negative anchor phrases."""

    def __init__(self, threshold=0.30):
        self.threshold = threshold
        self.pos_anchors = [
            "Kennzeichen", "Nummernschild", "Auto", "PKW", "Fahrzeug",
            "geblitzt", "Unfall", "Halter", "Zulassung", "abgemeldet",
            "Parkplatz", "Falschparker", "LKW",
        ]
        self.neg_anchors = [
            "Geburtsdatum", "IBAN", "Telefon", "ID-Nr", "Versichertennummer",
            "USt-ID",
        ]
        corpus = self.pos_anchors + self.neg_anchors
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(corpus)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)
        self.neg_vectors = self.vectorizer.transform(self.neg_anchors)

    def is_valid_context(self, text, start, end):
        """Return ``True`` when the 60-character neighbourhood of the candidate
        scores above the threshold and above the best negative anchor."""
        lo = max(0, start - 60)
        hi = min(len(text), end + 60)
        features = self.vectorizer.transform([text[lo:hi].lower()])
        best_positive = float(np.max(cosine_similarity(features, self.pos_vectors)))
        best_negative = float(np.max(cosine_similarity(features, self.neg_vectors)))
        return best_positive > self.threshold and best_positive > best_negative

class USTContextValidator:
    """Context check for VAT-ID (USt-IdNr) candidates based on TF-IDF
    similarity to positive and negative anchor phrases."""

    def __init__(self, threshold=0.40):
        self.threshold = threshold
        self.pos_anchors = [
            "USt-IdNr", "USt-ID", "Umsatzsteuer", "innergemeinschaftlich",
            "Finanzamt", "Rechnung", "VAT ID", "Steuernummer",
            "unternehmerisch",
        ]
        self.neg_anchors = [
            "Führerschein", "Auto", "Fahrer", "Unfall", "Krankenkasse",
            "Kennzeichen", "Fahrzeug",
        ]
        corpus = self.pos_anchors + self.neg_anchors
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(corpus)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)
        self.neg_vectors = self.vectorizer.transform(self.neg_anchors)

    def is_valid_context(self, text, start, end):
        """Return ``True`` when the 60-character neighbourhood of the candidate
        scores above the threshold and above the best negative anchor."""
        lo = max(0, start - 60)
        hi = min(len(text), end + 60)
        features = self.vectorizer.transform([text[lo:hi].lower()])
        best_positive = float(np.max(cosine_similarity(features, self.pos_vectors)))
        best_negative = float(np.max(cosine_similarity(features, self.neg_vectors)))
        return best_positive > self.threshold and best_positive > best_negative




class PINContextValidator:
    def __init__(self, threshold=0.65):
        self.threshold = threshold
        # Positive anchors based on your CSV data
        self.pos_anchors = ["CVV", "CVC", "CVV2", "CVC2", "Security Code"]
        # Heavy negative anchors to block house numbers and dates
        self.neg_anchors = ["Marktstraße", "Straße", "PLZ", "Hausnummer", "Jahr", "Datum", "Euro", "EUR", "Summe", "Telefon", "Tel", "Uhr", "Minuten", "Januar", "Februar"]
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.vectorizer.fit(self.pos_anchors + self.neg_anchors)
        self.pos_vectors = self.vectorizer.transform(self.pos_anchors)
        self.neg_vectors = self.vectorizer.transform(self.neg_anchors)

    def is_valid_context(self, text: str, start: int, end: int, candidate: str) -> bool:
        # STRICT RULE: Look for "CVV", "CVC", or "Prüf" within 15 characters
        proximity_window = text[max(0, start-15):min(len(text), end+15)].lower()
        if re.search(r'\b(cvv|cvc|prüf|security|kartenprüf)\b', proximity_window):
            return True

        # AI FALLBACK: For broader sentences, POS score must be high and beat NEG score
        window = text[max(0, start-50):min(len(text), end+50)].lower()
        input_vec = self.vectorizer.transform([window])
        pos_score = float(np.max(cosine_similarity(input_vec, self.pos_vectors)))
        neg_score = float(np.max(cosine_similarity(input_vec, self.neg_vectors)))
        
        return pos_score > self.threshold and pos_score > neg_score

# ==========================================
# 3. USER PROVIDED DETECTOR
# ==========================================
class RegexPIIDetector:
    """Regex-driven PII candidate finder with per-type validation and conflict resolution.

    Each pattern in ``self.patterns`` produces candidates. A candidate becomes a detection
    when its type-specific checks pass (format validation through ``Validators`` and/or a
    context validator passed to the constructor). Overlapping detections are then reduced
    to one per span by ``_resolve_conflicts``.
    """

    # Characters trimmed from both ends of a raw regex match.
    _TRIM = ".,;:!? "
    _TRIM_WS = ".,;:!? \n\r\t"

    def __init__(self, iban_validator=None, card_validator=None, phone_validator=None, passport_validator=None, svn_validator=None, national_validator=None, tax_validator=None, bic_validator=None, insurance_validator=None, license_validator=None, plate_validator=None, ust_validator=None, pin_validator=None):
        """Store the optional context validators and compile the candidate patterns.

        Every ``*_validator`` argument is an object exposing ``is_valid_context``; ``None``
        disables that context check (see ``detect`` for how each type treats ``None``).
        """
        self.iban_validator = iban_validator
        self.card_validator = card_validator
        self.phone_validator = phone_validator
        self.passport_validator = passport_validator
        self.svn_validator = svn_validator
        self.national_validator = national_validator
        self.tax_validator = tax_validator
        self.bic_validator = bic_validator
        self.insurance_validator = insurance_validator
        self.license_validator = license_validator
        self.plate_validator = plate_validator
        self.ust_validator = ust_validator
        self.pin_validator = pin_validator

        # Insertion order matters: it is the tie-breaker of the stable sort in _resolve_conflicts.
        self.patterns = {
            "PII:FINANCIAL:IBAN": re.compile(r"\b[A-Z]{2}\d{2}(?:[\s\.\-]*[A-Z0-9]){11,35}\b", re.IGNORECASE),
            "PII:FINANCIAL:BIC": re.compile(r"\b[A-Za-z]{4}[A-Za-z]{2}[A-Za-z0-9]{2}(?:[A-Za-z0-9]{3})?\b"),
            "PII:FINANCIAL:CARD": re.compile(r"\b[1-9](?:[\s\-\–\/\.]*\d){12,18}\b"),
            "PII:FINANCIAL:CARD_PARTIAL_INTERNAL": re.compile(r"(?i)(?:visa|mastercard|amex|girocard|kreditkarte|karte|endend|ending)\s*(?:auf|in|no|nr)?\s*(?::|#)?\s*\b(\d{4})\b"),
            #"PII:CONTACT:URL": re.compile(r"\b(?:(?:https?://|www\.)[\w\-\.\/%\+~=\?&]+|[\w\-]+(?:\.[a-zA-Z]{2,})+(?:/[\w\-\.\/%\+~=\?&]*)?)\b", re.IGNORECASE),
            "PII:CONTACT:PHONE": re.compile(r"(?i)(?<![a-zA-Z0-9\.])(?:(?:(?:\+|00|[o0])[1-9]\d{0,4})[\s\.\-\/\\]*(?:\(\s*[o0]\s*\)\s*)?|(?:\(\s*[o0][1-9]\d{1,8}\s*\)|[o0][1-9]\d{1,8}))(?:[ \t\.\-\/\\]*(?:onal|abortel)?[ \t\.\-\/\\]*\d){3,15}[l1]?(?![a-zA-Z0-9])"),
            "PII:CONTACT:EMAIL": re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}(?=[^a-z]|$)"),
            "PII:ID:UST": re.compile(r"\bDE(?:[\s\.\-]*\d){9}\b", re.IGNORECASE),
            "PII:ID:INSURANCE": re.compile(r"\b[A-Z](?:[\s\.\-]*\d){9}\b", re.IGNORECASE),
            "PII:DRIVERPLATE": re.compile(r"\b[A-ZÄÖÜ]{1,3}-[A-Z]{1,2}\s?\d{1,4}[EH]?\b", re.IGNORECASE),
            "PII:ID:TAX": re.compile(r"\b(?:\d{11}|\d{2}[\s\-]\d{3}[\s\-]\d{3}[\s\-]\d{3}|\d{2,3}/\d{3,4}/\d{4,5}|DE[\s]?\d{9})\b", re.IGNORECASE),
            "PII:ID:SVN": re.compile(r"\b\d{2}[\s]*\d{6}[\s]*[A-Z][\s]*\d{3}\b", re.IGNORECASE),
            "PII:ID:DRIVERLICENSE": re.compile(r"\b[A-Z0-9]{9,11}\b", re.IGNORECASE),
            "PII:ID:PASSPORT": re.compile(r"\b[A-Z0-9]{7,11}\b", re.IGNORECASE),
            "PII:ID:NATIONAL": re.compile(r"\b[L-Z0-9][0-9A-Z]{8,9}\b"),
            "PII:PIN": re.compile(r"\b\d{3,4}\b"),
            "PII:LOCATION:ADDRESS": re.compile(
                r"\b(?:[Aa]m|[Zz]um|[Aa]n der|[Ii]n der|[Aa]uf dem|[Bb]ei)?\s?"
                r"(?:[A-ZÄÖÜ][a-zäöüßA-Z0-9\-]*\s*){1,3}"
                r"(?:straße|strasse|str\.|weg|platz|allee|damm|ring|gasse|ufer|chaussee|hof|garten|markt|zeile|wall|graben|kirchweg|landstraße|pfad|autobahn|bundesstraße|zubringer)"
                r"(?:\s*(?:[Hh]ausnr\.?\s*)?([A-Z]?\d{1,4}[a-z]?|A\d+|B\d+))?\b", 
                re.IGNORECASE
            ),            
            "PII:LOCATION:POSTALCODE": re.compile(r"\b(?:PLZ\s+)?\d{5}\b", re.IGNORECASE),
        }

    @staticmethod
    def _strip_span(match, strip_chars: str):
        """Trim ``strip_chars`` from both ends of a match and return ``(text, start, end)``.

        The offsets describe the trimmed text inside the searched string. Characters
        removed at the front move ``start`` forward, so ``source[start:end] == text``
        also holds when a pattern matched a leading space or dot.
        """
        raw = match.group()
        leading = len(raw) - len(raw.lstrip(strip_chars))
        clean = raw.strip(strip_chars)
        start = match.start() + leading
        return clean, start, start + len(clean)

    def _validate_match(self, pii_type: str, text: str) -> bool:
        """Run the format validator registered for ``pii_type``; types without one pass."""
        if pii_type.startswith("PII:ID") or pii_type == "PII:FINANCIAL:BIC" or pii_type == "PII:DRIVERPLATE": 
            return Validators.validate_id(text, pii_type)
        #if pii_type == "PII:CONTACT:URL": return Validators.validate_url(text)
        if pii_type == "PII:CONTACT:PHONE": return Validators.validate_phone(text)
        elif pii_type == "PII:CONTACT:EMAIL": return Validators.validate_email(text)
        elif pii_type == "PII:FINANCIAL:IBAN": return Validators.validate_iban(text)
        elif pii_type == "PII:FINANCIAL:CARD": return Validators.validate_card(text)
        return True

    def _resolve_conflicts(self, detections: List[Dict]) -> List[Dict]:
        """Keep at most one detection per character span.

        Detections are ranked by type priority, then by length (longer first). They are
        accepted greedily as long as they do not overlap an already accepted one. The
        survivors are returned ordered by start offset.
        """
        if not detections: return []
        priority = {
            "PII:FINANCIAL:IBAN": 100, 
            "PII:FINANCIAL:CARD": 95, 
            "PII:FINANCIAL:BIC": 95, 
            "PII:ID:UST": 95, 
            "PII:PIN": 94,
            "PII:LOCATION:ADDRESS": 92,     
            "PII:ID:SVN": 90, 
            "PII:ID:INSURANCE": 88,      
            "PII:ID:DRIVERLICENSE": 87,  
            "PII:DRIVERPLATE": 86,      
            "PII:ID:PASSPORT": 85, 
            "PII:ID:NATIONAL": 85, 
            "PII:ID:TAX": 80, 
            "PII:CONTACT:EMAIL": 70,
            #"PII:CONTACT:URL": 50, 
            "PII:CONTACT:PHONE": 10,
            "PII:LOCATION:POSTALCODE": 8,
        }

        # Sort by priority first, then by the length of the detected span.
        sorted_dets = sorted(detections, key=lambda x: (-priority.get(x['type'], 0), -(x['end'] - x['start'])))

        # Greedy acceptance: a detection is dropped if any of its characters is already taken.
        final, occupied = [], set()
        for det in sorted_dets:
            r = range(det['start'], det['end'])
            if not any(i in occupied for i in r):
                final.append(det)
                occupied.update(r)
        
        return sorted(final, key=lambda x: x['start'])

    def detect(self, text: str) -> List[Dict]:
        """Find PII candidates in ``text`` and return the validated, non-overlapping ones.

        Returns:
            A list of dicts with the keys ``type``, ``text``, ``start``, ``end`` and
            ``confidence`` (always 1.0), ordered by ``start``. ``start``/``end`` are
            offsets of the trimmed detection text inside ``text``.
        """
        raw_detections = []
        for pii_type, pattern in self.patterns.items():
            for match in pattern.finditer(text):
                is_valid = False
                start_pos, end_pos = match.start(), match.end()
                detection_type = pii_type
                
                if pii_type == "PII:FINANCIAL:CARD_PARTIAL_INTERNAL":
                    # Only the four captured digits are reported, as a masked card number.
                    detection_type = "PII:FINANCIAL:CARD"
                    original_digits = match.group(1)
                    clean_text = f"****{original_digits}" 
                    start_pos, end_pos = match.start(1), match.end(1)
                    if self.card_validator and self.card_validator.is_valid_context(text, start_pos, end_pos):
                        is_valid = True

                elif pii_type == "PII:FINANCIAL:IBAN":
                    # The IBAN pattern is greedy across whitespace; cut the candidate at the
                    # first chunk that looks like an ordinary word rather than an IBAN group.
                    candidate = match.group()
                    parts = re.split(r'(\s+)', candidate)
                    stop_labels = {"BIC", "STOP", "ICH", "NAME", "GMBH", "IBAN", "AN", "AUF", "AM", "BIN", "WAR", "VON", "UND", "DIE", "IST", "DER", "DAS", "DAME", "HERR", "WIR", "IHR", "SEIN", "MIT", "BEI"}
                    valid_parts = []
                    word_idx = 0
                    for p in parts:
                        if not p.strip(): valid_parts.append(p); continue
                        t = p.strip(".,;:!?() ")
                        if not t: continue
                        if t.upper() in stop_labels or "ZZZ" in t.upper(): break
                        if t[0].isupper() and any(c.islower() for c in t): break
                        if word_idx > 0 and not any(c.isdigit() for c in t) and len(t) != 4: break
                        valid_parts.append(p); word_idx += 1
                    clean_text = "".join(valid_parts).strip(".,;:!? ")
                    end_pos = match.start() + len(clean_text)
                    if Validators.validate_iban(clean_text): is_valid = True
                    elif self.iban_validator and self.iban_validator.is_valid_context(text, match.start(), end_pos):
                        if 15 <= len(Validators.normalize(clean_text)) <= 34: is_valid = True

                elif pii_type == "PII:FINANCIAL:CARD":
                    # A digit run directly after '+' is a phone number, not a card.
                    if match.start() > 0 and text[match.start()-1] == '+': continue
                    clean_text, start_pos, end_pos = self._strip_span(match, self._TRIM)
                    if Validators.validate_card(clean_text): is_valid = True
                    elif self.card_validator and self.card_validator.is_valid_context(text, start_pos, end_pos): is_valid = True
                
                elif pii_type == "PII:ID:DRIVERLICENSE":
                    clean_text, start_pos, end_pos = self._strip_span(match, self._TRIM_WS)
                    if Validators.validate_id(clean_text, pii_type) and not clean_text.upper().startswith("DE"):
                        if self.license_validator and self.license_validator.is_valid_context(text, start_pos, end_pos):
                            is_valid = True

                elif pii_type == "PII:ID:INSURANCE":
                    clean_text, start_pos, end_pos = self._strip_span(match, self._TRIM_WS)
                    if Validators.validate_id(clean_text, pii_type):
                        if self.insurance_validator and self.insurance_validator.is_valid_context(text, start_pos, end_pos):
                            is_valid = True

                elif pii_type == "PII:ID:UST":
                    clean_text, start_pos, end_pos = self._strip_span(match, self._TRIM_WS)
                    if Validators.validate_id(clean_text, pii_type):
                        if self.ust_validator and self.ust_validator.is_valid_context(text, start_pos, end_pos):
                            is_valid = True

                elif pii_type == "PII:DRIVERPLATE":
                    clean_text, start_pos, end_pos = self._strip_span(match, self._TRIM_WS)
                    if self.plate_validator and self.plate_validator.is_valid_context(text, start_pos, end_pos):
                        is_valid = True

                elif pii_type == "PII:CONTACT:PHONE":
                    clean_text, start_pos, end_pos = self._strip_span(match, self._TRIM_WS)
                    if Validators.validate_phone(clean_text):
                        if self.phone_validator:
                            if self.phone_validator.is_valid_context(text, start_pos, end_pos):
                                is_valid = True
                        else: is_valid = True

                elif pii_type == "PII:ID:PASSPORT":
                    clean_text, start_pos, end_pos = self._strip_span(match, self._TRIM_WS)
                    if Validators.validate_id(clean_text, pii_type):
                        if self.passport_validator and self.passport_validator.is_valid_context(text, start_pos, end_pos, clean_text):
                            is_valid = True

                elif pii_type == "PII:ID:SVN":
                    clean_text, start_pos, end_pos = self._strip_span(match, self._TRIM_WS)
                    if Validators.validate_id(clean_text, pii_type):
                        if self.svn_validator:
                            if self.svn_validator.is_valid_context(text, start_pos, end_pos):
                                is_valid = True
                        else: is_valid = True

                elif pii_type == "PII:ID:NATIONAL":
                    clean_text, start_pos, end_pos = self._strip_span(match, self._TRIM_WS)
                    if Validators.validate_id(clean_text, pii_type):
                        if self.national_validator:
                            if self.national_validator.is_valid_context(text, start_pos, end_pos, clean_text):
                                is_valid = True
                        else: is_valid = True

                elif pii_type == "PII:ID:TAX":
                    clean_text, start_pos, end_pos = self._strip_span(match, self._TRIM_WS)
                    if Validators.validate_id(clean_text, pii_type):
                        if self.tax_validator:
                            if self.tax_validator.is_valid_context(text, start_pos, end_pos, clean_text):
                                is_valid = True
                        else: is_valid = True
                
                elif pii_type == "PII:FINANCIAL:BIC":
                    clean_text, start_pos, end_pos = self._strip_span(match, self._TRIM_WS)
                    if Validators.validate_id(clean_text, pii_type):
                        if self.bic_validator:
                            if self.bic_validator.is_valid_context(text, start_pos, end_pos, clean_text):
                                is_valid = True
                        else: is_valid = True

                elif pii_type == "PII:PIN":
                    # The pattern is digits only, so stripping never changes the span.
                    clean_text = match.group().strip(".,:; ")
                    if self.pin_validator and self.pin_validator.is_valid_context(text, match.start(), match.end(), clean_text):
                        is_valid = True

                else:
                    # E-mail, address and postal code. The address pattern may match a leading
                    # space and the e-mail pattern a leading dot, so the span must follow the
                    # stripped text instead of being anchored at match.start().
                    clean_text, start_pos, end_pos = self._strip_span(match, self._TRIM)
                    is_valid = self._validate_match(pii_type, clean_text)

                if is_valid:
                    raw_detections.append({"type": detection_type, "text": clean_text, "start": start_pos, "end": end_pos, "confidence": 1.0})
        return self._resolve_conflicts(raw_detections)

# ==========================================
# 4. AGE & BIRTHDAY EXTRACTION (CONTEXTUAL)
# ==========================================
class FastAgeExtractor:
    """Context-scored extractor for ages and dates of birth.

    Candidate numbers are replaced by ``<NUM>`` in a text snippet. The snippet is then
    compared (word n-gram TF-IDF, cosine similarity) with age, birthday and negative
    anchor phrases. A candidate is reported when its best positive score exceeds both
    the threshold and the best negative score.
    """

    def __init__(self, threshold=0.65):
        """Build the anchor lists and fit the vectorizer on all of them.

        Args:
            threshold: minimum positive similarity for a candidate to be reported.
        """
        self.threshold = threshold
        # Captured once at construction; ages are computed relative to this year.
        self.current_year = datetime.now().year

        # Phrases in which <NUM> is an age
        self.age_pos_anchors = [
            "Ich bin <NUM> Jahre alt", "Er ist <NUM> geworden", "Sie ist <NUM>", 
            "Mein Alter ist <NUM>", "Das Kind ist <NUM>", "Ein <NUM>-Jähriger", 
            "Mit <NUM> Jahren", "Alter: <NUM>", "Jahre alt", "(<NUM> Jahre)",
            "ist <NUM>", "schon <NUM>", "bin <NUM>", "bin erst <NUM>", "gerade <NUM>",
            "im Alter von <NUM>", "Sohn, <NUM> Jahre", "Tochter, <NUM> Jahre",
            "Frau Sabine (<NUM>)", "H. Goldberg (<NUM> Jahre)","bin <NUM> Jahre alt", "ist <NUM> geworden", "Alter: <NUM>", "Dame ist <NUM>",
            "bin erst <NUM>", "Tochter (<NUM>)", "Sohn, <NUM> Jahre", "bin <NUM> und kenn mich",
            "H. Goldberg (<NUM> Jahre)", "ca <NUM>, sehr schwerhörig", "bin ich <NUM> und Frühaufsteher",
            "war glaub so <NUM> oder so alt", "klang so <NUM>-<NUM> Jahre alt",
        ]

        # Phrases in which <NUM> is a date of birth
        self.dob_pos_anchors = [
            "Geboren am <NUM>", "Mein Geburtsdatum ist <NUM>", "geb. <NUM>", 
            "geb: <NUM>", "Geburtsdatum: <NUM>", "Dob: <NUM>", "Date of Birth: <NUM>",
            "geboren am", "geboren am <NUM> in", "geb <NUM>", "geb: <NUM>",
            "Geburtstag am <NUM>", "Jahrgang <NUM>", "Baujahr <NUM>", "geb. am <NUM>"
        ]

        # Phrases in which <NUM> is something else (time, price, address part, reference, ...)
        self.neg_anchors = [
            "Das kostet <NUM> Euro", "Preis <NUM> EUR", "Summe <NUM>", "Betrag <NUM>",
            "Hausnummer <NUM>", "Hausnr <NUM>", "PLZ <NUM>", "Postleitzahl <NUM>",
            "um <NUM> Uhr", "gegen <NUM> Uhr", "ab <NUM> Uhr", "bis <NUM> Uhr",
            "in <NUM> Minuten", "für <NUM> tage", "seit <NUM> tagen", "vor <NUM> jahren",
            "<NUM> prozent", "<NUM> kg", "<NUM> m", "artikel <NUM>", "nummer <NUM>",
            "am <NUM>.", "vom <NUM>.", "den <NUM>.", "bis zum <NUM>.", "am <NUM> <NUM>",
            "Klasse <NUM>", "Nr. <NUM>", "Zimmer <NUM>", "Gebäude <NUM>", "Stock <NUM>", "OG <NUM>",
            "rechnungsdatum <NUM>", "bestelldatum <NUM>", "fängt am <NUM>", "termin am <NUM>",
            "am <NUM> waren Ihre Mitarbeiter", "Auftrag vom <NUM>", "war am <NUM> bei Familie",
            "Rechnung vom <NUM>", "fängt am <NUM> an", "termin am <NUM>", "bis zum <NUM>",
            "rechnungsnummer 2024-<NUM>", "offener betrag <NUM>", "den <NUM>.", "vom <NUM>.",
            # Counts & codes
            "Hausnummer <NUM>", "Hausnr <NUM>", "PLZ <NUM>", "Postleitzahl <NUM>",
            "Betrag <NUM> Euro", "Preis <NUM> EUR", "Summe <NUM>", "<NUM> Parteien",
            "Rabatt von <NUM> %", "seit <NUM> Tagen", "vor <NUM> Jahren",
            # Communication noise
            "um <NUM> Uhr", "gegen <NUM> Uhr", "ab <NUM> Uhr", "bis <NUM> Uhr",
            "Ticket #<NUM>", "Durchwahl -<NUM>", "unter <NUM> erreichbar", "Zimmer <NUM>"
        ]

        # Word n-grams up to length 3 so multi-word context such as "geboren am" is a feature
        self.vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
        training_corpus = self.age_pos_anchors + self.dob_pos_anchors + self.neg_anchors
        self.vectorizer.fit(training_corpus)

        self.age_pos_vectors = self.vectorizer.transform(self.age_pos_anchors)
        self.dob_pos_vectors = self.vectorizer.transform(self.dob_pos_anchors)
        self.neg_vectors = self.vectorizer.transform(self.neg_anchors)

    def calculate_age(self, value_str):
        """Turn a candidate string into ``(age, is_birthday_format)``.

        Full dates (``dd.mm.yyyy``, ``dd/mm/yyyy``, ``dd-mm-yyyy``) and plain numbers in
        ``(1900, current_year]`` are treated as birth dates and converted to an age.
        Other integers are returned unchanged as an age. Unparseable input gives
        ``(None, False)``.
        """
        looks_like_date = re.search(r'\d{1,2}[./-]\d{1,2}[./-]\d{4}', value_str) is not None
        if looks_like_date:
            for date_format in ("%d.%m.%Y", "%d/%m/%Y", "%d-%m-%Y"):
                try:
                    born = datetime.strptime(value_str, date_format)
                except ValueError:
                    continue
                # One year less while this year's birthday is still ahead
                birthday_pending = (datetime.now().month, datetime.now().day) < (born.month, born.day)
                return self.current_year - born.year - birthday_pending, True

        try:
            number = int(value_str)
        except ValueError:
            return None, False

        # A number in this range is read as a birth year
        if 1900 < number <= self.current_year:
            return self.current_year - number, True
        return number, False

    def get_pii_type(self, age, is_birthday):
        """Map an age to its PII label; ``None`` for missing or implausible ages (outside 0-110)."""
        if age is None:
            return None
        if age < 0 or age > 110:
            return None
        if is_birthday:
            return "PII:BIRTHDAY"
        if age < 12:
            return "PII:AGE:CHILD"
        if age <= 17:
            return "PII:AGE:TEEN"
        if age <= 64:
            return "PII:AGE:ADULT"
        return "PII:AGE:SENIOR"

    def analyze_text(self, text):
        """Return age/birthday findings for ``text``.

        Each finding is a dict with ``type``, ``text``, ``start``, ``end`` and
        ``confidence`` (best positive similarity rounded to two decimals).
        """
        results = []
        # Candidates: full dates or standalone 1-3 digit numbers
        candidate_pattern = r'\b(\d{1,2}[./-]\d{1,2}[./-]\d{4}|\d{1,3})\b'
        for hit in re.finditer(candidate_pattern, text):
            number_text = hit.group(0)
            window_from = max(0, hit.start() - 50)
            window_to = min(len(text), hit.end() + 50)
            context = text[window_from:window_to].lower().replace(number_text.lower(), "<NUM>")
            context_vec = self.vectorizer.transform([context])

            age_score = float(np.max(cosine_similarity(context_vec, self.age_pos_vectors)))
            dob_score = float(np.max(cosine_similarity(context_vec, self.dob_pos_vectors)))
            noise_score = float(np.max(cosine_similarity(context_vec, self.neg_vectors)))

            top_positive = max(age_score, dob_score)
            # The positive context must clear the threshold and beat the negative context
            if not (top_positive > self.threshold and top_positive > noise_score):
                continue

            age, date_like = self.calculate_age(number_text)
            # Birthday label when the DOB context wins or the candidate itself is a date
            pii_label = self.get_pii_type(age, dob_score > age_score or date_like)
            if not pii_label:
                continue

            results.append({
                "type": pii_label,
                "text": number_text,
                "start": hit.start(),
                "end": hit.end(),
                "confidence": round(top_positive, 2),
            })
        return results

# ==========================================
# 5. UNIFIED PIPELINE (MERGED LOGIC)
# ==========================================
class UnifiedPIIPipeline:
    def __init__(self):
        """Load the spaCy model, register the rule-based entity patterns and build all detectors.

        The German model ``de_core_news_lg`` is downloaded on demand when it is not
        installed. After that the constructor sets up the word lists used for filtering
        person/location entities, an ``entity_ruler`` placed before the ``ner`` component,
        the age extractor, every context validator and the regex detector that uses them.
        """
        print("Loading NLP Models...")
        try:
            self.nlp = spacy.load("de_core_news_lg")
        except OSError:
            # spaCy reports a missing model package as OSError; anything else
            # (including KeyboardInterrupt) must not silently trigger a download.
            from spacy.cli import download
            download("de_core_news_lg")
            self.nlp = spacy.load("de_core_news_lg")

        self.medications = ["ibuprofen", "aspirin", "paracetamol", "antibiotika", "insulin"]
        self.conditions = ["kopfschmerzen", "migräne", "fieber", "husten", "diabetes"]
        self.stoplist = {
            "Morgen", "Heute", "Gestern", "Hallo", "Hi", "Hey", "Danke", "Bitte", "Grüße", 
            "Servus", "Moin", "Tschüss", "Bis", "Bald", "Ja", "Nein", "Vielleicht", 
            "Sehr", "Geehrte", "Damen", "Herren", "Liebe", "Lieber", "Herr", "Frau", 
            "Anwalt", "Arzt", "Ärztin", "Dr.", "Prof.", "Dipl.", "Ing."
        }
        
        self.noun_blocklist = {
            # Existing technical labels
            "Perso", "Ausweis", "Pass", "Konto", "Bank", "Iban", "Nummer", "Tel", "Handy", 
            "Telefon", "Email", "Mail", "Adresse", "Name", "Büro", "Handynummer", 
            # The trailing comma after "thank you" is required: without it Python joins the
            # literal with the next one ("Sohn") and both entries are lost.
            "Steuer-ID", "Glückszahl", "SV-Nr", "SVN", "BIC", "SWIFT", "sorry", "thanks", "thank you",
            
            # Relationship & Collective Terms (High FP in CSV)
            "Sohn", "Tochter", "Kind", "Enkel", "Enkelin", "Enkelkind", "Ehemann", "Ehefrau", 
            "Partner", "Partnerin", "Eltern", "Mutter", "Vater", "Bruder", "Schwester", 
            "Familie", "Ehepaar", "Fam", "Geschwister", "Freund", "Freundin", "Bekannte",
            
            # Business & Document Labels
            "Rechnungsnummer", "Kundennummer", "Steuernummer", "Steuernr", "USt-IdNr", 
            "Mitgliedsnummer", "Versicherungsnummer", "Auftragsnummer", "Führerschein", 
            "Führerschein-Nr", "Reisepass", "Passnummer", "Vertrag", "Abrechnung", 
            "Lohnabrechnung", "Girocard", "Mastercard", "Visa", "Kreditkarte",
            
            # Communication & Generic Contexts
            "Willkommensmail", "Sprachnachricht", "Chatverlauf", "Anfrage", "Antwort", 
            "Nachricht", "Zettel", "Vorlage", "Bericht", "Notiz", "Telefonnotiz", 
            "E-Mail", "Emailadresse", "Formular", "Personalbogen", "Ausschreibung",
            
            # Verbs/Nouns often capitalized at sentence start
            "Formulieren", "Formulier", "Schreiben", "Helfen", "Antworten", "Zusammenfassen", 
            "Bearbeiten", "Erfassen", "Eintragen", "Frage", "Problem", "Hilfe", "Rückruf",
            
            # Address/Location Parts
            "Allee", "Straße", "Strasse", "Weg", "Platz", "Ufer", "Wall", "Ring", "Gasse", 
            "Hof", "Garten", "Markt", "Hausnummer", "Hausnr", "PLZ", "Ort", "Stadt", 
            "Land", "Deutschland", "Berlin", "Hamburg", "München", "Köln",
            "PLZ", "Postleitzahl", "Anschrift", "Wohnhaft", "Einsatzort", "Hausnummer", "Hausnr", "Ort", "Stadt"
        }

        # Rule-based entities are added before the statistical NER component
        ruler = self.nlp.add_pipe("entity_ruler", before="ner")
        patterns = []
        for item in self.medications: patterns.append({"label": "MEDICATION", "pattern": [{"LOWER": item}]})
        for item in self.conditions: patterns.append({"label": "CONDITION", "pattern": [{"LOWER": item}]})
  
        # Raw string: "\." is a regex escape, not a Python string escape
        sfx = r"straße|strasse|str\.|weg|platz|allee|damm|ring|gasse|ufer|chaussee|hof|garten|markt|zeile|wall|graben|kirchweg|landstraße|pfad|autobahn|bundesstraße"
        # A token ending in a street suffix followed by a token starting with a digit
        patterns.append({"label": "ADDRESS_DETECTED", "pattern": [{"TEXT": {"REGEX": f"(?i).*({sfx})$"}}, {"TEXT": {"REGEX": r"^\d"}}]})
        # The literal "plz" followed by a five-digit token
        patterns.append({"label": "POSTALCODE_DETECTED", "pattern": [{"LOWER": "plz"}, {"TEXT": {"REGEX": r"^\d{5}$"}}]})
        ruler.add_patterns(patterns)

        self.matcher = DependencyMatcher(self.nlp.vocab)
        self.age_extractor = FastAgeExtractor(threshold=0.65)
        self.iban_context_validator = IBANContextValidator()
        self.card_context_validator = CardContextValidator()
        self.phone_context_validator = PhoneContextValidator()
        self.passport_context_validator = PassportContextValidator()
        self.svn_context_validator = SVNContextValidator()
        self.national_validator = NationalContextValidator()
        self.tax_context_validator = TaxContextValidator()
        self.bic_context_validator = BICContextValidator()
        # Initializing Added Validators
        self.insurance_validator = InsuranceContextValidator()
        self.license_validator = DriverLicenseContextValidator()
        self.plate_validator = DriverPlateContextValidator()
        self.ust_validator = USTContextValidator()
        self.pin_context_validator = PINContextValidator()
        
        self.regex_detector = RegexPIIDetector(
            iban_validator=self.iban_context_validator, 
            card_validator=self.card_context_validator,
            phone_validator=self.phone_context_validator,
            passport_validator=self.passport_context_validator,
            svn_validator=self.svn_context_validator,
            national_validator=self.national_validator,
            tax_validator=self.tax_context_validator,
            bic_validator=self.bic_context_validator,
            # Passing Added Validators
            insurance_validator=self.insurance_validator,
            license_validator=self.license_validator,
            plate_validator=self.plate_validator,
            ust_validator=self.ust_validator,
            pin_validator=self.pin_context_validator
        )


    def _trim_entity_by_pos(self, ent):
        """Uses SpaCy POS tags to remove leading Verbs, Pronouns, and Prepositions."""
        start = 0
        invalid_start_pos = {"PRON", "VERB", "ADP", "DET", "AUX", "PART", "ADV"}
        
        for token in ent:
            # If word is a 'noise' grammatical type or in your blocklist, skip it
            if token.pos_ in invalid_start_pos or token.lower_ in Validators.LOCATION_CONTEXT_WORDS:
                start += 1
            else:
                break
        
        if start >= len(ent): return None
        return ent[start:]

    # --- Location text clean-up ---
    def _clean_location_text(self, text: str) -> str:


        if "\n" in text:
            # We only want the line containing the actual street suffix
            suffixes_pattern = r"(straße|strasse|str\.|weg|platz|allee|damm|ring|gasse|ufer|chaussee|hof|garten|markt|zeile|wall|graben|kirchweg|landstraße|pfad|autobahn|bundesstraße)"
            lines = text.split("\n")
            for line in lines:
                if re.search(suffixes_pattern, line, re.IGNORECASE):
                    text = line
                    break
        """Aggressively prunes sentence context from the start of a location match."""
        # Common German street suffixes to anchor on
        suffixes = r"(straße|strasse|str\.|weg|platz|allee|damm|ring|gasse|ufer|chaussee|hof|garten|markt|zeile|wall|graben|kirchweg|landstraße|pfad|autobahn|bundesstraße)"
        match = re.search(suffixes, text, re.IGNORECASE)
        
        if not match:
            return text.strip(" \n\r\t.,:;-_")

        suffix_start = match.start()
        # Look at the text before the suffix
        pre_suffix = text[:suffix_start]
        words = re.split(r'(\s+)', pre_suffix)
        
        # Work backwards from the suffix to find the start of the actual street name
        clean_words = []
        # We typically want 1-2 words before the suffix (e.g., "Theodor Heuss" Allee)
        # but we stop if we hit a 'context word' (like 'wohnt')
        for i in range(len(words)-1, -1, -1):
            word_clean = words[i].strip(" \n\r\t.,:;-_").lower()
            if not word_clean:
                clean_words.insert(0, words[i])
                continue
            if word_clean in Validators.LOCATION_CONTEXT_WORDS or word_clean in self.noun_blocklist:
                break
            clean_words.insert(0, words[i])
            # Limit to 3 words before suffix to avoid grabbing the whole sentence
            if len([w for w in clean_words if w.strip()]) >= 3:
                break
                
        # Join the kept words with the rest of the string (suffix + house number)
        result = "".join(clean_words) + text[suffix_start:]
        return result.strip(" \n\r\t.,:;-_")



    def _generate_token(self, pii_type, text_segment):
        h = hashlib.md5(text_segment.lower().encode()).hexdigest()[:8]
        return f"[PII:{pii_type.replace(':', '_')}_ID_{h}]"

    def _analyze_person_entity(self, ent):
        role_keywords = {"Dr.", "Prof.", "Arzt", "Ärztin", "Herr", "Frau", "Anwalt"}
        detected_role = "N/A"
        clean_name_parts = []
        for token in ent:
            if token.text in role_keywords: detected_role = token.text
            else: clean_name_parts.append(token.text)
        if detected_role == "N/A" and ent.start > 0:
            prev_token = ent.doc[ent.start - 1]
            if prev_token.text in role_keywords: detected_role = prev_token.text
        clean_name = re.sub(r"[^\w\s-]", "", " ".join(clean_name_parts).strip())
        return clean_name or ent.text, detected_role

    def process_batch(self, text_list: List[str]) -> List[Dict[str, Any]]:
        results = []
        for original_text in text_list:
            start_time = time.time()
            all_findings = []
            occupied = set()

            regex_detections = self.regex_detector.detect(original_text)
            for f in regex_detections:
                if f['type'] == "PII:LOCATION:ADDRESS":
                    # Apply aggressive cleaning to the regex result
                    cleaned = self._clean_location_text(f['text'])
                    if not cleaned: continue
                    
                    # Recalculate positions based on the cleaned string
                    offset = f['text'].find(cleaned)
                    f['text'] = cleaned
                    f['start'] += offset
                    f['end'] = f['start'] + len(cleaned)
                
                # Double-check span isn't already occupied (prevents duplicate overlaps)
                if any(i in occupied for i in range(f['start'], f['end'])):
                    continue

                all_findings.append(f)
                occupied.update(range(f['start'], f['end']))

            for f in self.age_extractor.analyze_text(original_text):
                if not any(i in occupied for i in range(f['start'], f['end'])):
                    all_findings.append(f); occupied.update(range(f['start'], f['end']))

            doc = self.nlp(original_text)
            for ent in doc.ents:
                if any(i in occupied for i in range(ent.start_char, ent.end_char)): continue
                label, txt = ent.label_, ent.text
                if label in ["MEDICATION", "CONDITION"]:
                    continue
                    all_findings.append({"type": f"MED:{label}", "text": txt, "start": ent.start_char, "end": ent.end_char, "confidence": 0.9})
                    occupied.update(range(ent.start_char, ent.end_char))

                if label in ["LOC", "GPE", "ADDRESS_DETECTED", "POSTALCODE_DETECTED"]:
                    # TRIM BY POS (The "Stronger" part)
                    trimmed = self._trim_entity_by_pos(ent)
                    if not trimmed: continue
                    
                    clean_txt = trimmed.text.strip(" \n\r\t.,:;-_")
                    s_char, e_char = trimmed.start_char, trimmed.end_char
                    plz_match = re.search(r'\b\d{5}\b', clean_txt)
                    
                    if label == "POSTALCODE_DETECTED" or re.fullmatch(r"\d{5}", clean_txt):
                        all_findings.append({"type": "PII:LOCATION:POSTALCODE", "text": clean_txt, "start": s_char, "end": e_char, "confidence": 0.98})
                    elif label == "ADDRESS_DETECTED" or re.search(r"\d", clean_txt):
                        if plz_match and plz_match.group() != clean_txt:
                            # SPLIT: Address and Postal Code
                            plz_val = plz_match.group()
                            plz_start = s_char + clean_txt.find(plz_val)
                            all_findings.append({"type": "PII:LOCATION:POSTALCODE", "text": plz_val, "start": plz_start, "end": plz_start + 5, "confidence": 0.98})
                            
                            addr_part = clean_txt.replace(plz_val, "").strip(" ,")
                            if addr_part:
                                all_findings.append({"type": "PII:LOCATION:ADDRESS", "text": addr_part, "start": s_char, "end": s_char + len(addr_part), "confidence": 0.95})
                        else:
                            all_findings.append({"type": "PII:LOCATION:ADDRESS", "text": clean_txt, "start": s_char, "end": e_char, "confidence": 0.95})
                    occupied.update(range(s_char, e_char))

                    
                elif label in ["PER", "PER_STRONG"]:
                    clean_name, role = self._analyze_person_entity(ent)
                    if clean_name.title() not in self.stoplist and not any(p.strip().title() in self.noun_blocklist for p in clean_name.split()):
                        all_findings.append({"type": "PII:PERSON", "text": txt, "start": ent.start_char, "end": ent.end_char, "confidence": 0.9})
                        occupied.update(range(ent.start_char, ent.end_char))

            all_findings.sort(key=lambda x: x['start'], reverse=True)
            masked_text = original_text
            for f in all_findings:
                token = self._generate_token(f['type'], f['text'])
                masked_text = masked_text[:f['start']] + token + masked_text[f['end']:]
                f['token'] = token

            results.append({"has_pii": len(all_findings) > 0, "detections": all_findings[::-1], "anonymized_text": masked_text, "processing_time_ms": int((time.time() - start_time) * 1000)})
        return results

# ==========================================
# 6. EXECUTION
# ==========================================
if __name__ == "__main__":
    # Demo run: push a handful of German sample strings through the
    # pipeline and print whatever process_batch returns as indented JSON.
    samples = [
        "BIC: GIBACZPX",
        "Kowalski wechselt zur Barmer.",
        "USt-IdNr. DE287654321",
        "Führerschein AA3EFB51059",
        "Das Kennzeichen ist DO-RB 472",
        "Überweisung an Techniker Krankenkasse, SWIFT: NOLADE21KIE",
        "gültig bis 03/28, cvv 847 anzahlung für badezimmer-renovierung: 500€",
    ]

    pipeline = UnifiedPIIPipeline()
    output = pipeline.process_batch(samples)

    # ensure_ascii=False keeps the umlauts and the euro sign from the samples
    # readable in the dump instead of emitting \uXXXX escapes.
    print(json.dumps(output, ensure_ascii=False, indent=4))

Loading NLP Models...
[
    {
        "has_pii": true,
        "detections": [
            {
                "type": "PII:FINANCIAL:BIC",
                "text": "GIBACZPX",
                "start": 5,
                "end": 13,
                "confidence": 1.0,
                "token": "[PII:PII_FINANCIAL_BIC_ID_7c6cd69d]"
            }
        ],
        "anonymized_text": "BIC: [PII:PII_FINANCIAL_BIC_ID_7c6cd69d]",
        "processing_time_ms": 24
    },
    {
        "has_pii": true,
        "detections": [
            {
                "type": "PII:PERSON",
                "text": "Kowalski",
                "start": 0,
                "end": 8,
                "confidence": 0.9,
                "token": "[PII:PII_PERSON_ID_a8393058]"
            }
        ],
        "anonymized_text": "[PII:PII_PERSON_ID_a8393058] wechselt zur Barmer.",
        "processing_time_ms": 6
    },
    {
        "has_pii": true,
        "detections": [
            {
                "type": "PII:ID:UST"