In [1]:
# Cell 1: Import Libraries and Setup

# Import necessary libraries
import pandas as pd
import numpy as np
import re
import string
import time
import os
import warnings
from collections import defaultdict
import torch
from Levenshtein import distance as levenshtein_distance
from Levenshtein import jaro_winkler, ratio as levenshtein_ratio
import textdistance
from fuzzywuzzy import fuzz
import jellyfish
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Import transformers for BERT embeddings (optional dependency).
# BERTEmbedder consults `transformers_available` and uses a TF-IDF fallback when it is False.
try:
    from transformers import AutoTokenizer, AutoModel
    transformers_available = True
    print("Transformers library available for BERT embeddings")
except ImportError:
    transformers_available = False
    print("Warning: transformers library not available. Will use TF-IDF fallback.")

# Try to import pyahocorasick with fallback.
# The PyPI distribution is called "pyahocorasick", but the module it installs is
# named "ahocorasick". Alias it so the rest of the notebook can keep using the
# `pyahocorasick` name; the second attempt keeps any environment that really
# ships a module called `pyahocorasick` working.
try:
    try:
        import ahocorasick as pyahocorasick
    except ImportError:
        import pyahocorasick
    aho_corasick_available = True
    print("pyahocorasick is available")
except ImportError:
    print("Warning: pyahocorasick not available. Using fallback implementation.")
    aho_corasick_available = False

# Suppress warnings (note: this silences every warning category for the whole kernel)
warnings.filterwarnings('ignore')

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

print("All libraries imported successfully!")

Transformers library available for BERT embeddings
Using device: cpu
All libraries imported successfully!


In [3]:
# Cell 2: BERT Embeddings Class

class BERTEmbedder:
    """
    Generate fixed-size embeddings for text using a pre-trained transformer model.

    Token embeddings are reduced to one vector per text with a configurable
    pooling strategy ('mean', 'cls' or 'max'), and inputs are processed in
    batches. When the transformers library is unavailable or the model cannot
    be loaded, a character n-gram TF-IDF vectorizer is used instead.

    Attributes:
        initialized (bool): True when the pre-trained model was loaded;
            False when the TF-IDF fallback is in use.
    """

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2', pooling_strategy='mean', device=None):
        """
        Initialize BERT embedder with specified pre-trained model and pooling strategy.

        Args:
            model_name (str): Name of the pre-trained BERT model to use
            pooling_strategy (str): Pooling strategy ('mean', 'cls', or 'max');
                any other value behaves like 'mean'
            device: Device to run the model on (cuda or cpu); auto-detected when None
        """
        self.model_name = model_name
        self.pooling_strategy = pooling_strategy
        self.max_sequence_length = 512  # BERT's limit

        if device is None:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = device

        self.initialized = False

        # Initialize pre-trained model if transformers available
        if transformers_available:
            try:
                print(f"Loading pre-trained BERT model '{model_name}'...")
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModel.from_pretrained(model_name).to(self.device)
                self.model.eval()  # Set to evaluation mode
                self.initialized = True
                print(f"Pre-trained BERT model loaded successfully on {self.device}")
            except Exception as e:
                # Best effort: any load failure (network, missing weights, ...) selects the fallback
                print(f"Error initializing BERT model: {e}")
                self.initialized = False

        # Initialize TF-IDF fallback if BERT not available
        if not self.initialized:
            self.tfidf_vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4))
            self.tfidf_fitted = False
            print("Using TF-IDF fallback for embeddings")

    def _mean_pooling(self, model_output, attention_mask):
        """
        Mean pooling - average the token embeddings, ignoring padded positions.
        """
        token_embeddings = model_output[0]  # First element contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the token count so an all-padding row cannot divide by zero
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def _cls_pooling(self, model_output, attention_mask):
        """
        CLS pooling - use the embedding at the first token position.
        """
        return model_output[0][:, 0]

    def _max_pooling(self, model_output, attention_mask):
        """
        Max pooling - element-wise maximum over the non-padded token embeddings.
        """
        token_embeddings = model_output[0]
        padding_mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()) == 0
        # masked_fill returns a new tensor, so the model's own output is not overwritten
        masked_embeddings = token_embeddings.masked_fill(padding_mask, -1e9)
        return torch.max(masked_embeddings, 1)[0]

    def _get_pooled_embeddings(self, model_output, attention_mask):
        """
        Apply the selected pooling strategy (unknown strategies fall back to mean pooling).
        """
        if self.pooling_strategy == 'cls':
            return self._cls_pooling(model_output, attention_mask)
        if self.pooling_strategy == 'max':
            return self._max_pooling(model_output, attention_mask)
        # 'mean' and any unrecognised value
        return self._mean_pooling(model_output, attention_mask)

    def fit(self, texts):
        """
        Fit the TF-IDF vectorizer on a corpus of texts (only needed for TF-IDF fallback).

        Does nothing when the pre-trained model is in use.
        """
        if not self.initialized:
            # Fit TF-IDF vectorizer
            self.tfidf_vectorizer.fit(texts)
            self.tfidf_fitted = True
            print("TF-IDF vectorizer fitted on corpus")

    def encode(self, texts, batch_size=32, show_progress=False):
        """
        Encode texts into embeddings using the pre-trained model (or the TF-IDF fallback).

        Args:
            texts: A single string or an iterable of strings (list, tuple, Series, ...)
            batch_size: Batch size for processing; must be at least 1
            show_progress: Whether to print progress every 10 batches

        Returns:
            numpy.ndarray: One row per input text; an empty array for empty input

        Raises:
            ValueError: If batch_size is smaller than 1
        """
        # Handle single text input; materialise other iterables so that slicing
        # yields plain lists, which is what the tokenizer is given below
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        # Return empty array for empty input
        if len(texts) == 0:
            return np.array([])

        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Use pre-trained BERT if available
        if self.initialized:
            all_embeddings = []
            # Ceiling division: an exact multiple of batch_size must not report an extra batch
            total_batches = (len(texts) + batch_size - 1) // batch_size

            for i in range(0, len(texts), batch_size):
                if show_progress and i % (batch_size * 10) == 0:
                    print(f"Processing batch {i//batch_size + 1}/{total_batches}")

                batch_texts = texts[i:i+batch_size]

                # Tokenize
                encoded_input = self.tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    max_length=self.max_sequence_length,
                    return_tensors='pt'
                ).to(self.device)

                # Compute token embeddings without tracking gradients
                with torch.no_grad():
                    model_output = self.model(**encoded_input)
                    batch_embeddings = self._get_pooled_embeddings(model_output, encoded_input['attention_mask'])
                    all_embeddings.append(batch_embeddings.cpu().numpy())

            return np.vstack(all_embeddings)

        # Use TF-IDF fallback; an unfitted vectorizer is fitted on the texts passed here
        if not self.tfidf_fitted:
            self.fit(texts)

        return self.tfidf_vectorizer.transform(texts).toarray()

    def compute_similarity(self, text1, text2):
        """
        Compute cosine similarity between two texts using their embeddings.

        Args:
            text1: First text
            text2: Second text

        Returns:
            float: Cosine similarity score
        """
        # Get embeddings for both texts
        emb1 = self.encode(text1)
        emb2 = self.encode(text2)

        # Compute cosine similarity; the epsilon avoids division by zero for zero vectors
        denominator = np.linalg.norm(emb1) * np.linalg.norm(emb2) + 1e-8
        return float(np.sum(emb1 * emb2) / denominator)

# Initialize the shared embedder; BERTEmbedder switches to TF-IDF when the model cannot be loaded,
# so report which backend is actually active instead of always claiming the pre-trained model.
bert_embedder = BERTEmbedder(model_name='sentence-transformers/all-MiniLM-L6-v2', device=device)
if bert_embedder.initialized:
    print("BERT embedder initialized with pre-trained model!")
else:
    print("BERT embedder initialized with TF-IDF fallback (pre-trained model unavailable)")

Loading pre-trained BERT model 'sentence-transformers/all-MiniLM-L6-v2'...


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Pre-trained BERT model loaded successfully on cpu
BERT embedder initialized with pre-trained model!


In [5]:
# Cell 3: Enhanced Merchant Matcher with Pre-trained Models

class EnhancedMerchantMatcher:
    """
    Enhanced matcher using pre-trained models and rule-based scoring
    for merchant name and acronym matching.
    """
    
    def __init__(self, bert_embedder=None):
        """
        Set up the matcher's embedder, helper objects and static lookup tables.

        Args:
            bert_embedder: Pre-trained BERT embedder instance. When None and the
                transformers library is available, a default BERTEmbedder is
                created; otherwise the attribute stays None.
        """
        # Initialize pre-trained BERT embedder
        self.bert_embedder = bert_embedder
        if self.bert_embedder is None and transformers_available:
            self.bert_embedder = BERTEmbedder()
        
        # Initialize TF-IDF vectorizer (default word-level settings)
        self.tfidf_vectorizer = TfidfVectorizer()
        
        # Placeholder for a trie for approximate matching; only set to None here
        self.trie = None
        
        # Initialize Aho-Corasick automaton only if available
        if aho_corasick_available:
            self.automaton = pyahocorasick.Automaton()
        else:
            self.automaton = None
        
        # Define abbreviation dictionary - comprehensive industry knowledge
        # Maps a lowercase abbreviation to its lowercase expansion.
        # NOTE(review): keys containing spaces or punctuation (e.g. 'b of a',
        # 'j&j', '7-11') cannot be hit by a per-token lookup on
        # punctuation-stripped text - confirm how these are meant to be matched.
        self.abbreviations = {
            # Banking & Financial Institutions
            'bofa': 'bank of america', 'b of a': 'bank of america',
            'boa': 'bank of america', 'bac': 'bank of america',
            'jpm': 'jpmorgan chase', 'jpm chase': 'jpmorgan chase',
            'wf': 'wells fargo', 'wfb': 'wells fargo bank',
            'citi': 'citibank', 'citi bank': 'citibank',
            'gs': 'goldman sachs', 'ms': 'morgan stanley',
            'db': 'deutsche bank', 'hsbc': 'hongkong and shanghai banking corporation',
            'amex': 'american express', 'usb': 'us bank', 'rbc': 'royal bank of canada',
            'pnc': 'pnc financial services', 'td': 'toronto dominion bank',
            'bny': 'bank of new york', 'bnyc': 'bank of new york mellon',
            
            # Fast Food & Restaurant Chains
            'mcd': 'mcdonalds', 'mcds': 'mcdonalds', 'md': 'mcdonalds',
            'bk': 'burger king', 'kfc': 'kentucky fried chicken',
            'sbux': 'starbucks', 'sb': 'starbucks',
            'tb': 'taco bell', 'wen': 'wendys',
            'dq': 'dairy queen', 'ph': 'pizza hut',
            'dnkn': 'dunkin donuts', 'cfa': 'chick fil a',
            'cmg': 'chipotle mexican grill', 'ihop': 'international house of pancakes',
            'tgi': 'tgi fridays', 'tgif': 'tgi fridays',
            
            # Tech Companies
            'msft': 'microsoft', 'aapl': 'apple', 'goog': 'google',
            'googl': 'google', 'amzn': 'amazon', 'fb': 'facebook',
            'meta': 'meta platforms', 'nflx': 'netflix', 'tsla': 'tesla',
            'ibm': 'international business machines', 'csco': 'cisco systems',
            'orcl': 'oracle', 'intc': 'intel', 'amd': 'advanced micro devices',
            'nvda': 'nvidia', 'adbe': 'adobe', 'crm': 'salesforce',
            
            # Automotive
            'tm': 'toyota motor', 'toyof': 'toyota', 'toyota': 'toyota corporation',
            'f': 'ford motor company', 'gm': 'general motors',
            'hmc': 'honda motor company', 'hndaf': 'honda',
            'nsany': 'nissan', 'bmwyy': 'bmw', 'vwagy': 'volkswagen',
            
            # Retail companies
            'wmt': 'walmart', 'tgt': 'target', 'cost': 'costco',
            'hd': 'home depot', 'low': 'lowes', 'bby': 'best buy',
            'ebay': 'ebay', 'dg': 'dollar general', 'dltr': 'dollar tree',
            
            # Common abbreviations
            'j&j': 'johnson & johnson', 'jj': 'johnson johnson', 
            'jnj': 'johnson and johnson', '7-11': '7-eleven', 
            '711': '7-eleven', 'intl': 'international',
            'corp': 'corporation', 'inc': 'incorporated',
            
            # Address components
            'rd': 'road', 'st': 'street', 'ave': 'avenue', 
            'blvd': 'boulevard', 'ctr': 'center', 'ln': 'lane', 
            'dr': 'drive', 'pl': 'place', 'ct': 'court',
            'hwy': 'highway', 'pkwy': 'parkway', 'sq': 'square'
        }
        
        # Domain-specific abbreviation dictionaries, keyed by domain name.
        # Some keys also exist in self.abbreviations, either with a different
        # expansion ('dr': 'doctor' here vs 'drive' above) or the same one
        # ('ctr', 'intl', 'corp').
        self.domain_abbreviations = {
            'Medical': {
                'dr': 'doctor', 'hosp': 'hospital', 'med': 'medical',
                'clin': 'clinic', 'pharm': 'pharmacy', 'lab': 'laboratory',
                'dept': 'department', 'ctr': 'center', 'inst': 'institute'
            },
            'Government': {
                'govt': 'government', 'dept': 'department', 'admin': 'administration',
                'auth': 'authority', 'fed': 'federal', 'natl': 'national',
                'comm': 'commission', 'sec': 'secretary', 'org': 'organization'
            },
            'Education': {
                'univ': 'university', 'coll': 'college', 'acad': 'academy',
                'elem': 'elementary', 'sch': 'school', 'inst': 'institute',
                'dept': 'department', 'lib': 'library', 'lab': 'laboratory'
            },
            'Financial': {
                'fin': 'financial', 'svcs': 'services', 'mgmt': 'management',
                'assoc': 'associates', 'intl': 'international', 'grp': 'group',
                'corp': 'corporation', 'cap': 'capital', 'inv': 'investment'
            }
        }
        
        # Stop words to remove during preprocessing (legal suffixes and function words)
        self.stopwords = {
            'inc', 'llc', 'co', 'ltd', 'corp', 'plc', 'na', 'the', 
            'and', 'of', 'for', 'in', 'a', 'an', 'by', 'to', 'at',
            'corporation', 'incorporated', 'company', 'limited'
        }
        
        # Domain-specific stopwords, keyed by the same domain names as above
        self.domain_stopwords = {
            'Medical': {'center', 'healthcare', 'medical', 'health', 'care', 'services'},
            'Government': {'department', 'office', 'agency', 'bureau', 'division'},
            'Education': {'university', 'college', 'school', 'institute', 'academy'},
            'Financial': {'financial', 'services', 'management', 'capital', 'investment'}
        }
        
        # Pre-defined domain weights for similarity algorithms
        self.domain_weights = self._get_domain_weights()
    
    def _get_domain_weights(self):
        """Get pre-defined domain weights for each similarity algorithm"""
        return {
            'default': {
                'jaro_winkler': 0.10,
                'damerau_levenshtein': 0.05,
                'tfidf_cosine': 0.05,
                'jaccard_bigram': 0.05,
                'soundex': 0.05,
                'token_sort_ratio': 0.10,
                'contains_ratio': 0.10,
                'fuzzy_levenshtein': 0.05,
                'trie_approximate': 0.10,
                'bert_similarity': 0.15,
                'aho_corasick': 0.05,
                'acronym_formation': 0.15
            },
            'Restaurant': {
                'acronym_formation': 0.20,
                'bert_similarity': 0.15,
                'token_sort_ratio': 0.12,
                'contains_ratio': 0.12
            },
            'Banking': {
                'acronym_formation': 0.18,
                'bert_similarity': 0.15,
                'trie_approximate': 0.12
            },
            'Automotive': {
                'acronym_formation': 0.18,
                'contains_ratio': 0.15,
                'bert_similarity': 0.12
            },
            'Medical': {
                'soundex': 0.12,
                'bert_similarity': 0.15,
                'acronym_formation': 0.15
            },
            'Government': {
                'trie_approximate': 0.15,
                'acronym_formation': 0.18,
                'token_sort_ratio': 0.12
            },
            'Technology': {
                'bert_similarity': 0.15,
                'acronym_formation': 0.15,
                'tfidf_cosine': 0.12
            },
            'Education': {
                'bert_similarity': 0.15,
                'acronym_formation': 0.15,
                'token_sort_ratio': 0.12
            },
            'Retail': {
                'bert_similarity': 0.12,
                'token_sort_ratio': 0.12,
                'acronym_formation': 0.15
            },
            'Financial': {
                'bert_similarity': 0.15,
                'acronym_formation': 0.15,
                'token_sort_ratio': 0.10
            }
        }
    
    def preprocess_with_domain(self, text, domain=None):
        """
        Preprocesses text with domain-specific handling.

        Steps: lowercase, replace punctuation with spaces, expand abbreviations
        token by token, drop general and domain stopwords, and collapse whitespace.

        Abbreviation expansion prefers the domain-specific dictionary over the
        general one, so a domain can override a general meaning (for example
        'dr' as 'doctor' rather than 'drive'). Multi-word expansions are split
        back into tokens before stopword removal, so an expanded abbreviation
        normalises the same way as the spelled-out name ('bofa' and
        'Bank of America' both become 'bank america').

        Args:
            text (str): Text to preprocess
            domain (str, optional): Domain for specialized preprocessing

        Returns:
            str: Preprocessed text; "" when text is not a string
        """
        if not isinstance(text, str):
            return ""

        # Convert to lowercase and replace punctuation with spaces
        text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text.lower())

        # Domain dictionary (if any) is consulted before the general one
        domain_expansions = {}
        if domain and domain in self.domain_abbreviations:
            domain_expansions = self.domain_abbreviations[domain]

        tokens = []
        for word in text.split():
            expansion = domain_expansions.get(word)
            if expansion is None:
                expansion = self.abbreviations.get(word, word)
            # Re-tokenise so stopwords inside a multi-word expansion are filtered too
            tokens.extend(expansion.split())

        # Domain-specific stopwords apply only when the domain is known
        domain_stopwords = ()
        if domain and domain in self.domain_stopwords:
            domain_stopwords = self.domain_stopwords[domain]

        kept = [
            token for token in tokens
            if token not in self.stopwords and token not in domain_stopwords
        ]

        # Joining split tokens with single spaces leaves no extra whitespace
        return ' '.join(kept)
    
    def preprocess_pair(self, acronym, full_name, domain=None):
        """Run preprocess_with_domain on the acronym, then the full name; return both as a tuple."""
        return tuple(
            self.preprocess_with_domain(raw_text, domain)
            for raw_text in (acronym, full_name)
        )
    
    def jaro_winkler_similarity(self, acronym, full_name, domain=None):
        """Jaro-Winkler similarity of the preprocessed pair; 0 when either side is empty."""
        short_form, long_form = self.preprocess_pair(acronym, full_name, domain)
        if short_form and long_form:
            return jaro_winkler(short_form, long_form)
        # Nothing left to compare after preprocessing
        return 0
    
    def damerau_levenshtein_similarity(self, acronym, full_name, domain=None):
        """
        Damerau-Levenshtein similarity of the preprocessed pair.

        Computed as 1 - distance / (length of the longer string), floored at 0.
        Returns 0 when either preprocessed string is empty.
        """
        short_form, long_form = self.preprocess_pair(acronym, full_name, domain)
        if not (short_form and long_form):
            return 0

        # Both strings are non-empty here, so the normaliser is at least 1
        longest = max(len(short_form), len(long_form))
        edits = textdistance.damerau_levenshtein.distance(short_form, long_form)

        # Floor at zero so the score is never negative
        return max(0, 1 - (edits / longest))
    
    def tfidf_cosine_similarity(self, acronym, full_name, domain=None):
        """
        Calculate TF-IDF cosine similarity between the preprocessed pair.

        The shared self.tfidf_vectorizer is re-fitted on just these two strings
        on every call. Returns 0 when either string is empty after
        preprocessing or when vectorisation fails.

        Returns:
            float or int: Similarity floored at 0
        """
        acronym_clean, full_name_clean = self.preprocess_pair(acronym, full_name, domain)
        # Check if strings are empty
        if not acronym_clean or not full_name_clean:
            return 0

        # Fit and transform with TF-IDF
        try:
            tfidf_matrix = self.tfidf_vectorizer.fit_transform([acronym_clean, full_name_clean])
            similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        except Exception:
            # Best effort: a vectoriser failure scores as "no similarity".
            # Catching Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so a long run can still be interrupted.
            return 0

        # Plain float, floored at zero
        return max(0, float(similarity))
    
    def jaccard_bigram_similarity(self, acronym, full_name, domain=None):
        """
        Jaccard index over the character-bigram sets of the preprocessed pair.

        Returns 0 when either string is empty or when neither string yields
        any bigram (both are single characters).
        """
        short_form, long_form = self.preprocess_pair(acronym, full_name, domain)
        if not (short_form and long_form):
            return 0

        def bigram_set(text):
            # Every pair of adjacent characters, de-duplicated
            return {text[pos:pos + 2] for pos in range(len(text) - 1)}

        left = bigram_set(short_form)
        right = bigram_set(long_form)

        combined = left | right
        if not combined:
            return 0

        return len(left & right) / len(combined)
    
    def soundex_similarity(self, acronym, full_name, domain=None):
        """
        Calculate phonetic similarity using Soundex algorithm.

        Each word of both preprocessed strings is converted to a Soundex code.
        The score is the number of acronym codes that can be paired one-to-one
        with a full-name code, divided by the larger of the two word counts.

        Returns:
            float: Score between 0.0 and 1.0; 0.0 when either string is empty
            or the Soundex computation fails
        """
        acronym_clean, full_name_clean = self.preprocess_pair(acronym, full_name, domain)
        # If either string is empty, return 0
        if not acronym_clean or not full_name_clean:
            return 0.0

        # Get soundex codes for each word
        try:
            acronym_codes = [jellyfish.soundex(word) for word in acronym_clean.split()]
            full_name_codes = [jellyfish.soundex(word) for word in full_name_clean.split()]
        except Exception:
            # Best effort: a Soundex failure scores as "no similarity".
            # Catching Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            return 0.0

        total = max(len(acronym_codes), len(full_name_codes))

        matches = 0
        for code in acronym_codes:
            if code in full_name_codes:
                matches += 1
                # Remove the matched code to avoid double counting
                full_name_codes.remove(code)

        return matches / total if total > 0 else 0.0
    
    def token_sort_ratio_similarity(self, acronym, full_name, domain=None):
        """fuzzywuzzy token-sort ratio of the preprocessed pair, rescaled from 0-100 to 0-1."""
        short_form, long_form = self.preprocess_pair(acronym, full_name, domain)
        if not (short_form and long_form):
            # An empty side cannot be compared
            return 0

        percent = fuzz.token_sort_ratio(short_form, long_form)
        return percent / 100
    
    def contains_ratio_similarity(self, acronym, full_name, domain=None):
        """
        Score how much of the preprocessed acronym is contained in the full name.

        Returns 1 when the acronym is a substring of the full name. Otherwise
        returns the share of acronym characters that can be paired one-to-one
        with a character of the full name, in any order. Returns 0 when either
        string is empty.
        """
        short_form, long_form = self.preprocess_pair(acronym, full_name, domain)
        if not (short_form and long_form):
            return 0

        # Full containment
        if short_form in long_form:
            return 1

        # Partial containment: each character counts as often as it occurs in
        # both strings, which equals matching characters one by one and
        # consuming each matched character of the full name
        paired = sum(
            min(short_form.count(ch), long_form.count(ch))
            for ch in set(short_form)
        )
        return paired / len(short_form)
    
    def fuzzy_levenshtein_similarity(self, acronym, full_name, domain=None):
        """Levenshtein ratio of the preprocessed pair; 0 when either side is empty."""
        short_form, long_form = self.preprocess_pair(acronym, full_name, domain)
        if short_form and long_form:
            # The ratio is already normalised by the library
            return levenshtein_ratio(short_form, long_form)
        return 0
    
    def trie_approximate_similarity(self, acronym, full_name, domain=None):
        """
        Compare the preprocessed acronym with the initials of the full name.

        Returns 1 when the acronym equals the first letters of the full-name
        words (case-insensitive). Otherwise returns
        1 - levenshtein_distance / (longer length), floored at 0. Returns 0
        when either string is empty or the full name has no words.
        """
        short_form, long_form = self.preprocess_pair(acronym, full_name, domain)
        if not (short_form and long_form):
            return 0

        tokens = long_form.split()
        if not tokens:
            return 0

        # Initials of every word in the full name
        initials = ''.join(token[0] for token in tokens if token)

        candidate = short_form.lower()
        target = initials.lower()
        if candidate == target:
            return 1

        # Normalise the edit distance by the longer of the two strings
        longest = max(len(short_form), len(initials))
        if longest == 0:
            return 0

        gap = levenshtein_distance(candidate, target)
        return max(0, 1 - (gap / longest))
    
    def aho_corasick_similarity(self, acronym, full_name, domain=None):
        """
        Score the share of acronym characters that occur in the full name.

        Each acronym character is matched against at most one character of the
        full name, so repeated letters in the full name cannot inflate the
        score. The character occurrences are located with a pyahocorasick
        automaton when the library is available and with a plain scan
        otherwise; both paths return the same score.

        Returns:
            float or int: matched characters / acronym length, capped at 1.0;
            0 when either preprocessed string is empty
        """
        acronym_clean, full_name_clean = self.preprocess_pair(acronym, full_name, domain)
        # Check if strings are empty
        if not acronym_clean or not full_name_clean:
            return 0

        distinct_chars = set(acronym_clean)

        if aho_corasick_available:
            # One pattern per distinct acronym character; the stored value is the character itself
            automaton = pyahocorasick.Automaton()
            for char in distinct_chars:
                automaton.add_word(char, char)
            automaton.make_automaton()
            found = [char for _, char in automaton.iter(full_name_clean)]
        else:
            # Fallback implementation when pyahocorasick is not available
            found = [char for char in full_name_clean if char in distinct_chars]

        # A character matches only as often as it appears in BOTH strings
        matches = sum(
            min(acronym_clean.count(char), found.count(char))
            for char in distinct_chars
        )

        return min(1.0, matches / len(acronym_clean))
    
    def bert_similarity(self, acronym, full_name, domain=None):
        """
        Calculate semantic similarity using the configured embedder.

        Both strings are preprocessed and encoded with self.bert_embedder, and
        the cosine similarity of the two embeddings is clamped to [0, 1] so the
        result matches the range of the other similarity scores.

        Args:
            acronym (str): The acronym to match
            full_name (str): The full name to match against
            domain (str, optional): Domain for specialized preprocessing

        Returns:
            float: BERT similarity score between 0 and 1; 0 when no embedder is
            configured, either string is empty, or encoding fails
        """
        # If BERT embedder is not initialized, return 0
        if self.bert_embedder is None:
            return 0

        acronym_clean, full_name_clean = self.preprocess_pair(acronym, full_name, domain)

        # Check if strings are empty
        if not acronym_clean or not full_name_clean:
            return 0

        try:
            # Get embeddings from the embedder
            emb1 = self.bert_embedder.encode(acronym_clean)
            emb2 = self.bert_embedder.encode(full_name_clean)

            # Cosine similarity; the epsilon avoids division by zero for zero vectors
            similarity = np.sum(emb1 * emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2) + 1e-8)

            # Raw cosine can be negative (or NaN for degenerate input); clamp to
            # the documented [0, 1] range. max(0.0, nan) evaluates to 0.0.
            return float(min(1.0, max(0.0, similarity)))
        except Exception as e:
            print(f"Error in BERT similarity calculation: {e}")
            return 0
    
    def acronym_formation_score(self, acronym, full_name, domain=None):
        """
        Score how well the acronym is formed from the initials of the full name.

        Returns 1 when the preprocessed acronym equals the first letters of the
        full-name words (case-insensitive). Otherwise returns the share of
        acronym characters that can be paired one-to-one with an initial, in
        any order. Returns 0 when either string is empty or the full name has
        no words.
        """
        short_form, long_form = self.preprocess_pair(acronym, full_name, domain)
        if not (short_form and long_form):
            return 0

        tokens = long_form.split()
        if not tokens:
            return 0

        # Standard acronym formation: first letter of each word
        initials = ''.join(token[0] for token in tokens if token).lower()
        letters = short_form.lower()

        # Perfect acronym
        if letters == initials:
            return 1

        if len(letters) == 0:
            return 0

        # Partial match: a letter counts as often as it occurs in both strings,
        # which equals consuming one initial per matched acronym letter
        paired = sum(
            min(letters.count(ch), initials.count(ch))
            for ch in set(letters)
        )
        return paired / len(letters)
    
    def enhanced_acronym_formation_score(self, acronym, full_name, domain=None):
        """
        Enhanced acronym formation score with special handling for common patterns
        particularly optimized for restaurant chains and business names with prefixes like "Mc".
        
        Args:
            acronym (str): The acronym to evaluate
            full_name (str): The full name to match against
            domain (str, optional): Domain for specialized preprocessing
            
        Returns:
            float: A score between 0 and 1 indicating how well the acronym matches the full name
        """
        acronym_clean, full_name_clean = self.preprocess_pair(acronym, full_name, domain)
        
        # Basic cleanup
        acronym = acronym_clean.lower()
        full_name = full_name_clean.lower()
        
        # Special case for "Mc" prefixes (common in restaurant names)
        if full_name.startswith('mc') and len(acronym) >= 1 and acronym[0] == 'm':
            # McDonalds -> MCD pattern
            modified_full_name = full_name[2:]  # Remove "mc"
            remaining_chars = acronym[1:]  # Remove "m"
            
            # For "MCD" -> "McDonalds" pattern
            if remaining_chars and len(modified_full_name) > 0:
                # Check if remaining chars match consonants in the name
                consonants = ''.join([c for c in modified_full_name if c not in 'aeiou'])
                if remaining_chars in consonants:
                    return 0.95
                
                # Check if first few consonants match remaining chars
                first_consonants = ''.join([c for c in modified_full_name[:len(remaining_chars)*2] 
                                          if c not in 'aeiou'])
                if remaining_chars in first_consonants:
                    return 0.90
                
                # Check first letters after "Mc"
                words = modified_full_name.split()
                if words:
                    first_letters = ''.join([word[0] for word in words if word])
                    if remaining_chars in first_letters:
                        return 0.90
                    
                    # Check if remaining chars appear in sequence in the words
                    current_word_position = 0
                    chars_found = 0
                    for char in remaining_chars:
                        for i in range(current_word_position, len(words)):
                            if char in words[i]:
                                chars_found += 1
                                current_word_position = i + 1
                                break
                    
                    if chars_found == len(remaining_chars):
                        return 0.85
            
            # Even if not a perfect match, it's still a good score for Mc prefix
            return 0.80
        
        # Check for brand name with location prefix/suffix pattern (Toyota Corporation -> Western Toyota)
        common_brands = ['toyota', 'ford', 'honda', 'bmw', 'walmart', 'target', 'starbucks']
        location_prefixes = ['north', 'south', 'east', 'west', 'western', 'eastern', 'central']
        
        # Extract the key brand name (if present)
        brand_match = None
        for brand in common_brands:
            if brand in acronym.lower():
                brand_match = brand
                break
            if brand in full_name.lower():
                brand_match = brand
                break
        
        if brand_match:
            # Check if one name has the brand with a location prefix/suffix and the other has just the brand
            has_location_prefix = any(prefix in acronym.lower() or prefix in full_name.lower() 
                                     for prefix in location_prefixes)
            
            if has_location_prefix:
                # If both contain the brand name but one has location prefix
                if brand_match in acronym.lower() and brand_match in full_name.lower():
                    return 0.92
        
        # Standard acronym formation - first letter of each word
        words = full_name.split()
        if not words:
            return 0
        
        # Get first letters
        first_letters = ''.join([word[0] for word in words if word])
        
        # If exact match, return high score
        if acronym == first_letters:
            return 1.0
        
        # Check for consonant-based acronym (common in business acronyms)
        consonants = ''.join([c for c in full_name if c not in 'aeiou' and c.isalpha()])
        consonant_match = 0.0
        if len(acronym) <= len(consonants):
            # Check for sequential consonant match
            acronym_position = 0
            for i, c in enumerate(consonants):
                if acronym_position < len(acronym) and c == acronym[acronym_position]:
                    acronym_position += 1
            
            consonant_sequential_match = acronym_position / len(acronym) if len(acronym) > 0 else 0
            
            # Check for any consonant match
            matches = 0
            consonants_copy = consonants
            for char in acronym:
                if char in consonants_copy:
                    matches += 1
                    consonants_copy = consonants_copy.replace(char, '', 1)
            
            consonant_any_match = matches / len(acronym) if len(acronym) > 0 else 0
            
            # Take the better score
            consonant_match = max(consonant_sequential_match, consonant_any_match)
            
            # Give higher scores for strong consonant matches
            if consonant_match > 0.7:
                return max(0.85, consonant_match)
        
        # Check if acronym characters appear in order in full name
        ordered_match = 0
        last_found_index = -1
        full_name_chars = list(full_name)
        
        for char in acronym:
            found = False
            for i in range(last_found_index + 1, len(full_name_chars)):
                if char == full_name_chars[i]:
                    ordered_match += 1
                    last_found_index = i
                    found = True
                    break
            
            # If we couldn't find the character in order, try looking anywhere
            if not found:
                for i in range(len(full_name_chars)):
                    if i != last_found_index and char == full_name_chars[i]:
                        ordered_match += 0.5  # Half credit for out-of-order match
                        full_name_chars[i] = '_'  # Mark as used
                        break
        
        ordered_match_score = ordered_match / len(acronym) if len(acronym) > 0 else 0
        
        # Check capitals in the full name (businesses often use capitals in their names)
        capitals = ''.join([c for c in full_name if c.isupper()])
        if capitals and acronym.upper() == capitals:
            return 0.95
        
        # Return the best score from different matching strategies
        return max(
            ordered_match_score * 0.9,  # Ordered match is good but not perfect
            consonant_match * 0.9,      # Consonant match is also valuable
            0.4                         # Minimum score to prevent too low values
        )
    
    def get_all_similarity_scores(self, acronym, full_name, domain=None):
        """
        Calculate all similarity scores at once for efficiency
        
        Args:
            acronym (str): The acronym to match
            full_name (str): The full name to match against
            domain (str, optional): Domain for specialized preprocessing
            
        Returns:
            dict: Dictionary of algorithm name to score
        """
        # Return empty dictionary if either acronym or full_name is None
        if acronym is None or full_name is None:
            return {}
        
        # Calculate all similarity scores
        scores = {
            'jaro_winkler': self.jaro_winkler_similarity(acronym, full_name, domain),
            'damerau_levenshtein': self.damerau_levenshtein_similarity(acronym, full_name, domain),
            'tfidf_cosine': self.tfidf_cosine_similarity(acronym, full_name, domain),
            'jaccard_bigram': self.jaccard_bigram_similarity(acronym, full_name, domain),
            'soundex': self.soundex_similarity(acronym, full_name, domain),
            'token_sort_ratio': self.token_sort_ratio_similarity(acronym, full_name, domain),
            'contains_ratio': self.contains_ratio_similarity(acronym, full_name, domain),
            'fuzzy_levenshtein': self.fuzzy_levenshtein_similarity(acronym, full_name, domain),
            'trie_approximate': self.trie_approximate_similarity(acronym, full_name, domain),
            'aho_corasick': self.aho_corasick_similarity(acronym, full_name, domain),
            'acronym_formation': self.acronym_formation_score(acronym, full_name, domain),
            'enhanced_acronym_formation': self.enhanced_acronym_formation_score(acronym, full_name, domain)
        }
        
        # Add BERT similarity if available
        if self.bert_embedder is not None:
            scores['bert_similarity'] = self.bert_similarity(acronym, full_name, domain)
        
        # Add pattern detection scores
        scores.update(self._detect_domain_patterns(acronym, full_name, domain))
        
        return scores
    
    def _detect_domain_patterns(self, acronym, full_name, domain):
        """
        Detect domain-specific patterns in the text
        
        Args:
            acronym (str): The acronym to match
            full_name (str): The full name to match against
            domain (str, optional): Domain for specialized preprocessing
            
        Returns:
            dict: Dictionary of pattern name to score (0 or 1)
        """
        patterns = {}
        
        # Detect brand with location pattern
        patterns['pattern_brand_with_location'] = self._detect_brand_with_location(acronym, full_name, domain)
        
        # Detect corporation suffix pattern
        patterns['pattern_corporation_suffix'] = self._detect_corporation_suffix(acronym, full_name, domain)
        
        # Detect department prefix pattern
        patterns['pattern_department_prefix'] = self._detect_department_prefix(acronym, full_name, domain)
        
        # Add domain-specific patterns
        if domain == 'Medical':
            patterns.update(self._detect_medical_patterns(acronym, full_name))
        elif domain == 'Financial':
            patterns.update(self._detect_financial_patterns(acronym, full_name))
        elif domain == 'Government':
            patterns.update(self._detect_government_patterns(acronym, full_name))
        elif domain == 'Education':
            patterns.update(self._detect_education_patterns(acronym, full_name))
        
        return patterns
    
    def _detect_brand_with_location(self, acronym, full_name, domain):
        """
        Detect if this is a case of a brand name with location prefix/suffix,
        like 'Western Toyota' -> 'Toyota Corporation'
        """
        acronym_clean, full_name_clean = self.preprocess_pair(acronym, full_name, domain)
        words_a = acronym_clean.lower().split()
        words_f = full_name_clean.lower().split()
        
        if len(words_a) <= 1 or len(words_f) <= 1:
            return 0.0
            
        # Common brand identifiers that might appear with location prefixes
        common_brands = ['toyota', 'ford', 'honda', 'bmw', 'walmart', 'target', 
                        'starbucks', 'mcdonalds', 'marriott', 'hilton']
                        
        # Check if any common brand appears in either name
        has_brand_a = any(brand in words_a for brand in common_brands)
        has_brand_f = any(brand in words_f for brand in common_brands)
        
        # Check if either name has a location modifier
        location_modifiers = ['north', 'south', 'east', 'west', 'central', 'downtown',
                             'city', 'regional', 'local', 'western', 'eastern']
        has_location_a = any(loc in words_a for loc in location_modifiers)
        has_location_f = any(loc in words_f for loc in location_modifiers)
        
        # One name has brand, other has brand + location OR one has location, other has brand
        if (has_brand_a and has_brand_f and (has_location_a != has_location_f)):
            return 1.0
        
        return 0.0
    
    def _detect_corporation_suffix(self, acronym, full_name, domain):
        """
        Detect if this is a case where one name has a corporate suffix and the other doesn't
        like 'Toyota' -> 'Toyota Corporation'
        """
        acronym_clean, full_name_clean = self.preprocess_pair(acronym, full_name, domain)
        words_a = acronym_clean.lower().split()
        words_f = full_name_clean.lower().split()
        
        if not words_a or not words_f:
            return 0.0
        
        corporate_suffixes = ['corporation', 'corp', 'inc', 'incorporated', 'llc', 
                             'limited', 'ltd', 'company', 'co', 'group']
        
        # Check if one name ends with a corporate suffix and the other doesn't
        name1_has_suffix = any(words_a[-1] == suffix for suffix in corporate_suffixes)
        name2_has_suffix = any(words_f[-1] == suffix for suffix in corporate_suffixes)
        
        return 1.0 if name1_has_suffix != name2_has_suffix else 0.0
    
    def _detect_department_prefix(self, acronym, full_name, domain):
        """
        Detect if this is a case where one name has a department prefix
        like 'Finance Department' -> 'Department of Finance'
        """
        acronym_clean, full_name_clean = self.preprocess_pair(acronym, full_name, domain)
        dept_terms = ['department', 'dept', 'division', 'office', 'bureau']
        
        # Check for department terms in either name
        has_dept_a = any(term in acronym_clean.lower() for term in dept_terms)
        has_dept_f = any(term in full_name_clean.lower() for term in dept_terms)
        
        # Both have department terms but in different positions
        if has_dept_a and has_dept_f:
            words_a = acronym_clean.lower().split()
            words_f = full_name_clean.lower().split()
            
            dept_pos_a = next((i for i, word in enumerate(words_a) if any(term in word for term in dept_terms)), -1)
            dept_pos_f = next((i for i, word in enumerate(words_f) if any(term in word for term in dept_terms)), -1)
            
            # Department terms in different positions (start vs end) suggests same entity with different naming
            if dept_pos_a != -1 and dept_pos_f != -1:
                # One at beginning, one at end
                if (dept_pos_a == 0 and dept_pos_f == len(words_f) - 1) or \
                   (dept_pos_f == 0 and dept_pos_a == len(words_a) - 1):
                    return 1.0
        
        # One has department term, other doesn't, but common words
        elif has_dept_a != has_dept_f:
            words_a = set(acronym_clean.lower().split())
            words_f = set(full_name_clean.lower().split())
            common_words = words_a.intersection(words_f)
            
            # If they share significant words (excluding stopwords)
            common_words = common_words - self.stopwords
            if len(common_words) >= 1:
                return 1.0
        
        return 0.0
    
    def _detect_medical_patterns(self, acronym, full_name):
        """
        Detect patterns specific to medical domain
        
        Args:
            acronym (str): The acronym to match
            full_name (str): The full name to match against
            
        Returns:
            dict: Dictionary of pattern name to score (0 or 1)
        """
        patterns = {}
        
        # Preprocess
        acronym_clean = self.preprocess_with_domain(acronym, 'Medical')
        full_name_clean = self.preprocess_with_domain(full_name, 'Medical')
        
        # Check for medical center / hospital pattern
        medical_terms = ['hospital', 'medical', 'clinic', 'health', 'healthcare', 'center']
        has_medical_a = any(term in acronym_clean.lower() for term in medical_terms)
        has_medical_f = any(term in full_name_clean.lower() for term in medical_terms)
        
        # If one has medical term and other doesn't, but they share other significant words
        if has_medical_a != has_medical_f:
            words_a = set(acronym_clean.lower().split())
            words_f = set(full_name_clean.lower().split())
            common_words = words_a.intersection(words_f) - self.stopwords - set(medical_terms)
            
            if len(common_words) >= 1:
                patterns['pattern_medical_facility'] = 1.0
            else:
                patterns['pattern_medical_facility'] = 0.0
        else:
            patterns['pattern_medical_facility'] = 0.0
        
        # Detect department / specialty pattern
        specialties = ['cardiology', 'neurology', 'pediatrics', 'oncology', 'radiology', 
                       'orthopedic', 'surgery', 'emergency', 'trauma', 'psychiatric']
        
        has_specialty_a = any(spec in acronym_clean.lower() for spec in specialties)
        has_specialty_f = any(spec in full_name_clean.lower() for spec in specialties)
        
        # If both have specialties but different ones, or one has specialty and other doesn't
        if has_specialty_a or has_specialty_f:
            patterns['pattern_medical_specialty'] = 1.0
        else:
            patterns['pattern_medical_specialty'] = 0.0
        
        return patterns
    
    def _detect_financial_patterns(self, acronym, full_name):
        """
        Detect patterns specific to financial domain
        
        Args:
            acronym (str): The acronym to match
            full_name (str): The full name to match against
            
        Returns:
            dict: Dictionary of pattern name to score (0 or 1)
        """
        patterns = {}
        
        # Preprocess
        acronym_clean = self.preprocess_with_domain(acronym, 'Financial')
        full_name_clean = self.preprocess_with_domain(full_name, 'Financial')
        
        # Check for bank / financial institution pattern
        bank_terms = ['bank', 'financial', 'credit', 'investment', 'capital', 'securities', 'asset']
        has_bank_a = any(term in acronym_clean.lower() for term in bank_terms)
        has_bank_f = any(term in full_name_clean.lower() for term in bank_terms)
        
        # Bank naming patterns often have location + bank
        location_terms = ['america', 'national', 'city', 'state', 'first', 'trust',
                          'international', 'north', 'south', 'east', 'west', 'central']
        has_location_a = any(term in acronym_clean.lower() for term in location_terms)
        has_location_f = any(term in full_name_clean.lower() for term in location_terms)
        
        # Bank + location pattern
        if (has_bank_a and has_bank_f) and (has_location_a or has_location_f):
            patterns['pattern_bank_with_location'] = 1.0
        else:
            patterns['pattern_bank_with_location'] = 0.0
        
        # Check for financial services pattern
        service_terms = ['services', 'management', 'advisors', 'associates', 'group', 'partners']
        has_service_a = any(term in acronym_clean.lower() for term in service_terms)
        has_service_f = any(term in full_name_clean.lower() for term in service_terms)
        
        if has_service_a != has_service_f and (has_bank_a or has_bank_f):
            patterns['pattern_financial_services'] = 1.0
        else:
            patterns['pattern_financial_services'] = 0.0
        
        return patterns
    
    def _detect_government_patterns(self, acronym, full_name):
        """
        Detect patterns specific to government domain
        
        Args:
            acronym (str): The acronym to match
            full_name (str): The full name to match against
            
        Returns:
            dict: Dictionary of pattern name to score (0 or 1)
        """
        patterns = {}
        
        # Preprocess
        acronym_clean = self.preprocess_with_domain(acronym, 'Government')
        full_name_clean = self.preprocess_with_domain(full_name, 'Government')
        
        # Government agency patterns
        agency_terms = ['agency', 'administration', 'authority', 'commission', 'bureau', 
                       'board', 'service', 'office', 'council']
        has_agency_a = any(term in acronym_clean.lower() for term in agency_terms)
        has_agency_f = any(term in full_name_clean.lower() for term in agency_terms)
        
        # Classic government acronym pattern: words -> initialism
        words_f = full_name_clean.lower().split()
        if len(words_f) >= 3 and len(acronym_clean) >= 2:
            # Check if acronym consists of first letters
            first_letters = ''.join(word[0] for word in words_f if word)
            if acronym_clean.lower() in first_letters.lower():
                patterns['pattern_government_initialism'] = 1.0
            else:
                patterns['pattern_government_initialism'] = 0.0
        else:
            patterns['pattern_government_initialism'] = 0.0
        
        # Department of X vs X Department pattern
        if has_agency_a or has_agency_f:
            patterns['pattern_government_agency'] = 1.0
        else:
            patterns['pattern_government_agency'] = 0.0
        
        return patterns
    
    def _detect_education_patterns(self, acronym, full_name):
        """
        Detect patterns specific to education domain
        
        Args:
            acronym (str): The acronym to match
            full_name (str): The full name to match against
            
        Returns:
            dict: Dictionary of pattern name to score (0 or 1)
        """
        patterns = {}
        
        # Preprocess
        acronym_clean = self.preprocess_with_domain(acronym, 'Education')
        full_name_clean = self.preprocess_with_domain(full_name, 'Education')
        
        # Education institution patterns
        edu_terms = ['university', 'college', 'school', 'institute', 'academy', 
                    'education', 'learning', 'studies']
        has_edu_a = any(term in acronym_clean.lower() for term in edu_terms)
        has_edu_f = any(term in full_name_clean.lower() for term in edu_terms)
        
        # Common pattern: University of X vs X University
        if has_edu_a and has_edu_f:
            words_a = acronym_clean.lower().split()
            words_f = full_name_clean.lower().split()
            
            # Check for "X University" vs "University of X" pattern
            uni_pos_a = next((i for i, word in enumerate(words_a) if word in edu_terms), -1)
            uni_pos_f = next((i for i, word in enumerate(words_f) if word in edu_terms), -1)
            
            if uni_pos_a != -1 and uni_pos_f != -1:
                # One at beginning, one at end or different positions
                if uni_pos_a != uni_pos_f:
                    patterns['pattern_edu_institution_name_variation'] = 1.0
                else:
                    patterns['pattern_edu_institution_name_variation'] = 0.0
            else:
                patterns['pattern_edu_institution_name_variation'] = 0.0
        else:
            patterns['pattern_edu_institution_name_variation'] = 0.0
        
        # Department/School within university pattern
        dept_terms = ['department', 'dept', 'school', 'faculty', 'college']
        has_dept_a = any(term in acronym_clean.lower() for term in dept_terms)
        has_dept_f = any(term in full_name_clean.lower() for term in dept_terms)
        
        if has_dept_a != has_dept_f and (has_edu_a or has_edu_f):
            patterns['pattern_edu_department'] = 1.0
        else:
            patterns['pattern_edu_department'] = 0.0
        
        return patterns
    
    def compute_weighted_score(self, acronym, full_name, domain=None):
        """
        Compute weighted similarity score using pre-defined weights
        
        Args:
            acronym (str): The acronym to match
            full_name (str): The full name to match against
            domain (str, optional): Domain for weighting and preprocessing
            
        Returns:
            float: Weighted similarity score between 0 and 1
        """
        # Get all similarity scores
        scores = self.get_all_similarity_scores(acronym, full_name, domain)
        
        # Get domain-specific weights
        weights = self._get_domain_specific_weights(domain)
        
        # Calculate weighted score
        weighted_score = 0.0
        weights_used = 0.0
        
        for algo, score in scores.items():
            if algo in weights:
                weighted_score += weights[algo] * score
                weights_used += weights[algo]
        
        # Handle case where some algorithms are missing
        if weights_used > 0:
            # Normalize by weights actually used
            weighted_score /= weights_used
        
        # Apply special pattern boosts
        pattern_boost = 1.0
        for algo, score in scores.items():
            if algo.startswith('pattern_') and score > 0:
                # Different boost factors for different patterns
                if 'brand_with_location' in algo:
                    pattern_boost += 0.25  # 25% boost for brand with location
                elif 'corporation_suffix' in algo:
                    pattern_boost += 0.20  # 20% boost for corporation suffix
                elif 'department_prefix' in algo:
                    pattern_boost += 0.20  # 20% boost for department prefix
                elif 'medical' in algo:
                    pattern_boost += 0.25  # 25% boost for medical patterns
                elif 'bank' in algo:
                    pattern_boost += 0.25  # 25% boost for bank patterns
                elif 'government' in algo:
                    pattern_boost += 0.20  # 20% boost for government patterns
                elif 'edu' in algo:
                    pattern_boost += 0.20  # 20% boost for education patterns
                else:
                    pattern_boost += 0.15  # 15% boost for other patterns
        
        # Apply the pattern boost, cap at 1.5 (50% boost max)
        weighted_score = min(1.0, weighted_score * min(pattern_boost, 1.5))
        
        # Boost scores that are already reasonably good
        if weighted_score > 0.75:
            weighted_score = min(1.0, weighted_score * 1.2)  # 20% boost for high scores
        elif weighted_score > 0.6:
            weighted_score = min(1.0, weighted_score * 1.15)  # 15% boost for good scores
        
        return weighted_score
    
    def _get_domain_specific_weights(self, domain):
        """
        Get domain-specific weights
        
        Args:
            domain (str): The domain to get weights for
            
        Returns:
            dict: Dictionary of algorithm weights
        """
        if domain in self.domain_weights:
            # Start with default weights
            weights = self.domain_weights['default'].copy()
            
            # Update with domain-specific weights
            weights.update(self.domain_weights[domain])
            
            # Normalize weights to sum to 1
            weight_sum = sum(weights.values())
            return {k: v/weight_sum for k, v in weights.items()}
        else:
            # Return normalized default weights
            weights = self.domain_weights['default'].copy()
            weight_sum = sum(weights.values())
            return {k: v/weight_sum for k, v in weights.items()}
    
    def compute_enhanced_score(self, acronym, full_name, domain=None):
        """
        Compute enhanced score with additional pattern recognition and boosting
        
        Args:
            acronym (str): The acronym to match
            full_name (str): The full name to match against
            domain (str, optional): Domain for weighting and preprocessing
            
        Returns:
            float: Enhanced similarity score between 0 and 1
        """
        # First get the base weighted score
        base_score = self.compute_weighted_score(acronym, full_name, domain)
        
        # Get all scores for pattern detection
        all_scores = self.get_all_similarity_scores(acronym, full_name, domain)
        
        # Apply additional pattern-based boosting for specific domains
        enhanced_score = base_score
        
        # Check for exact matches in domain-specific patterns
        if domain == 'Restaurant':
            # Special case for McDonalds patterns
            if (acronym.upper().startswith('MC') and 'donald' in full_name.lower()) or \
               (full_name.upper().startswith('MC') and 'donald' in acronym.lower()):
                enhanced_score = min(1.0, enhanced_score * 1.4)  # 40% boost
            
            # Special case for common restaurant chains
            restaurant_chains = ['starbucks', 'mcdonalds', 'wendys', 'burger king', 
                                'taco bell', 'pizza hut', 'subway', 'kfc']
            if any(chain in acronym.lower() for chain in restaurant_chains) and \
               any(chain in full_name.lower() for chain in restaurant_chains):
                enhanced_score = min(1.0, enhanced_score * 1.3)  # 30% boost
            
        elif domain == 'Automotive':
            # Special case for Toyota with location
            if (('toyota' in acronym.lower() and any(loc in full_name.lower() for loc in ['north', 'south', 'east', 'west'])) or \
                ('toyota' in full_name.lower() and any(loc in acronym.lower() for loc in ['north', 'south', 'east', 'west']))):
                enhanced_score = min(1.0, enhanced_score * 1.35)  # 35% boost
            
        elif domain == 'Banking':
            # Special case for Bank of X patterns
            if 'bank of' in acronym.lower() or 'bank of' in full_name.lower():
                enhanced_score = min(1.0, enhanced_score * 1.3)  # 30% boost
            
        elif domain == 'Medical':
            # Special case for Hospital/Medical Center patterns
            medical_terms = ['hospital', 'medical center', 'clinic', 'healthcare']
            if any(term in acronym.lower() for term in medical_terms) or \
               any(term in full_name.lower() for term in medical_terms):
                enhanced_score = min(1.0, enhanced_score * 1.25)  # 25% boost
            
        # Check for very high individual algorithm scores
        high_score_algos = ['bert_similarity', 'enhanced_acronym_formation', 'jaro_winkler']
        for algo in high_score_algos:
            if algo in all_scores and all_scores[algo] > 0.9:
                enhanced_score = min(1.0, enhanced_score * 1.2)  # 20% boost for high individual scores
                break
        
        # Final capping and boosting
        if enhanced_score > 0.8:
            # Already high score, apply final boost
            enhanced_score = min(1.0, enhanced_score * 1.15)  # 15% final boost for high scores
        
        return enhanced_score

# Initialize enhanced merchant matcher with pre-trained BERT model.
# `bert_embedder` is not created in this cell: it has to exist in the kernel
# already (presumably built in an earlier cell - verify with Restart & Run All).
merchant_matcher = EnhancedMerchantMatcher(bert_embedder=bert_embedder)
print("Enhanced merchant matcher initialized with pre-trained models!")

Enhanced merchant matcher initialized with pre-trained models!


In [7]:
# Cell 4: Common Acronyms Dictionary

# Define dictionary of common acronyms for well-known brands.
# Maps a short form (key) to the full merchant name (value), grouped by industry.
# Notes for maintainers:
#   - Keys are stored exactly as written, so 'MCDs' and 'MCDS' are two distinct
#     entries; a lookup has to use the same casing as the key.
#   - Several short forms may point to the same full name (e.g. 'GOOGL' and 'GOOG').
#   - Single-letter keys exist ('C', 'F').
#   - Many keys look like stock ticker symbols - TODO confirm the intended source.
COMMON_ACRONYMS = {
    # Restaurant chains
    'MCD': 'McDonalds',
    'MD': 'McDonalds',
    'MCDs': 'McDonalds',
    'MCDS': 'McDonalds',
    'BK': 'Burger King',
    'KFC': 'Kentucky Fried Chicken',
    'SB': 'Starbucks',
    'SBUX': 'Starbucks',
    'TB': 'Taco Bell',
    'WEN': 'Wendys',
    'DQ': 'Dairy Queen',
    'PH': 'Pizza Hut',
    'DNKN': 'Dunkin Donuts',
    'CFA': 'Chick-fil-A',
    'CMG': 'Chipotle Mexican Grill',
    
    # Banking and Financial institutions
    'BAC': 'Bank of America',
    'BOFA': 'Bank of America',
    'JPM': 'JPMorgan Chase',
    'WFC': 'Wells Fargo',
    'C': 'Citigroup',
    'GS': 'Goldman Sachs',
    'MS': 'Morgan Stanley',
    'AXP': 'American Express',
    'HSBC': 'Hongkong and Shanghai Banking Corporation',
    
    # Technology companies
    'MSFT': 'Microsoft',
    'AAPL': 'Apple',
    'GOOGL': 'Google',
    'GOOG': 'Google',
    'AMZN': 'Amazon',
    'FB': 'Facebook',
    'META': 'Meta Platforms',
    'NFLX': 'Netflix',
    'TSLA': 'Tesla',
    
    # Automotive companies
    'TM': 'Toyota Motor',
    'TOYOF': 'Toyota',
    'TOYOTA': 'Toyota Corporation',
    'F': 'Ford',
    'GM': 'General Motors',
    'HMC': 'Honda Motor Company',
    'HNDAF': 'Honda',
    'NSANY': 'Nissan',
    'BMWYY': 'BMW',
    'VWAGY': 'Volkswagen',
    
    # Retail companies
    'WMT': 'Walmart',
    'TGT': 'Target',
    'COST': 'Costco',
    'HD': 'Home Depot',
    'LOW': 'Lowes',
    'BBY': 'Best Buy',
    'EBAY': 'eBay',
    'DG': 'Dollar General',
    'DLTR': 'Dollar Tree',
}

# Report the number of entries so the cell output confirms the table loaded
print(f"Dictionary of {len(COMMON_ACRONYMS)} common acronyms defined!")

Dictionary of 52 common acronyms defined!


In [21]:
# Cell 5: Data Loading and Processing Functions
import pandas as pd

def load_merchant_data(file_path):
    """
    Read merchant records from an Excel workbook.

    Reading is best-effort: if anything goes wrong while loading or
    summarizing the workbook, the error is printed and a small built-in
    sample table is returned instead, so this function does not raise.

    Args:
        file_path (str): Location of the Excel workbook to read

    Returns:
        DataFrame: The workbook contents, or on failure a 10-row sample table
        with the columns 'Acronym', 'Full_Name' and 'Merchant_Category'
    """
    try:
        merchants = pd.read_excel(file_path)

        # Summarize what was just read so the notebook reader can eyeball it
        print(f"Loaded {len(merchants)} merchant entries from {file_path}")
        print(f"Columns: {merchants.columns.tolist()}")
        print(f"\nSample data:")
        print(merchants.head(3))

        return merchants

    except Exception as load_error:
        print(f"Error loading merchant data: {load_error}")
        print("Using sample data instead...")

        # One tuple per sample merchant: (acronym, full name, category).
        # Covers restaurant, Toyota-dealership and ticker-style examples.
        fallback_rows = [
            ('ANZ', 'Australia and New Zealand Banking Group', 'Banking'),
            ('MCD', 'McDonalds', 'Restaurant'),
            ('MD', 'McDonalds', 'Restaurant'),
            ('MLD', 'McDonalds', 'Restaurant'),
            ('Western Toyota', 'Toyota Corporation', 'Automotive'),
            ('Mosman Toyota', 'Toyota Corporation', 'Automotive'),
            ('AMZN', 'Amazon', 'Retail'),
            ('GOOG', 'Google', 'Technology'),
            ('MS', 'Morgan Stanley', 'Finance'),
            ('WMT', 'Walmart', 'Retail'),
        ]
        fallback = pd.DataFrame(
            fallback_rows,
            columns=['Acronym', 'Full_Name', 'Merchant_Category'],
        )
        print(fallback)
        return fallback

def standardize_column_names(df):
    """
    Return a copy of ``df`` whose columns use the canonical names
    'Acronym', 'Full_Name' and 'Merchant_Category'.

    Known aliases (e.g. 'Full Name', 'category', 'ShortName') are renamed to
    their canonical name. An alias is left untouched when its canonical name
    is already present in the frame, or has already been claimed by an
    earlier alias, because renaming it would create two columns with the same
    label. The input DataFrame is never modified.

    Args:
        df (DataFrame): Input DataFrame

    Returns:
        DataFrame: New DataFrame with standardized column names; a
        'Merchant_Category' column filled with 'Unknown' is added if absent

    Raises:
        ValueError: If 'Acronym' or 'Full_Name' is still missing after renaming
    """
    # Map of possible column names to standardized names
    column_mappings = {
        'Full Name': 'Full_Name',
        'Full_name': 'Full_Name',
        'fullname': 'Full_Name',
        'full_name': 'Full_Name',
        'Merchant Category': 'Merchant_Category',
        'merchant_category': 'Merchant_Category',
        'Category': 'Merchant_Category',
        'category': 'Merchant_Category',
        'acronym': 'Acronym',
        'Abbreviation': 'Acronym',
        'ShortName': 'Acronym',
        'Short_Name': 'Acronym',
        'short_name': 'Acronym'
    }

    # Build a single rename map. `taken` tracks every label that exists or is
    # about to exist, so that no two columns end up with the same name.
    taken = set(df.columns)
    rename_map = {}
    for old_name, new_name in column_mappings.items():
        if old_name not in df.columns:
            continue
        if new_name in taken:
            print(f"Warning: column '{old_name}' left as-is because "
                  f"'{new_name}' already exists.")
            continue
        rename_map[old_name] = new_name
        taken.add(new_name)

    # rename() returns a new DataFrame, so the caller's frame stays untouched
    df_copy = df.rename(columns=rename_map)

    # Ensure required columns exist
    required_columns = ['Acronym', 'Full_Name']
    missing_columns = [col for col in required_columns if col not in df_copy.columns]

    if missing_columns:
        raise ValueError(f"Required columns {missing_columns} not found in the DataFrame")

    # If Merchant_Category is missing, add a default value
    if 'Merchant_Category' not in df_copy.columns:
        print("Warning: 'Merchant_Category' column not found. Adding with default value 'Unknown'.")
        df_copy['Merchant_Category'] = 'Unknown'

    return df_copy

def preprocess_merchant_data(df):
    """
    Clean merchant data and normalize its categories ahead of matching.

    Steps, in order:
      1. Standardize the column names via standardize_column_names.
      2. Replace missing 'Acronym' / 'Full_Name' values with '' and missing
         'Merchant_Category' values with 'Unknown', casting all three to str.
      3. Drop rows whose acronym or full name is blank after stripping.
      4. Map each category onto a standard domain when one of that domain's
         keywords appears at the start of a word in the category
         (case-insensitive). Domains are tried in the order listed below and
         the first hit wins; categories with no hit are kept unchanged.
      5. Print the ten most frequent categories.

    Keywords are anchored to word starts so that fragments buried inside
    another word do not match: 'car' no longer fires on "Healthcare" and
    'state' no longer fires on "Real Estate". Suffixed forms such as
    "Restaurants" or "Banks" still match; closed compounds such as
    "Bookstore" do not.

    Args:
        df (DataFrame): Input DataFrame

    Returns:
        DataFrame: Preprocessed copy of the input; surviving rows keep their
        original index labels (the index is not reset)
    """
    # Standardize column names and work on a private copy
    df_processed = standardize_column_names(df).copy()

    # Handle missing values. Categories need this too: a NaN category would
    # otherwise reach .lower() below and raise AttributeError.
    df_processed['Acronym'] = df_processed['Acronym'].fillna('').astype(str)
    df_processed['Full_Name'] = df_processed['Full_Name'].fillna('').astype(str)
    df_processed['Merchant_Category'] = (
        df_processed['Merchant_Category'].fillna('Unknown').astype(str)
    )

    # Remove rows with empty acronyms or full names. The explicit copy keeps
    # the later column assignment from targeting a slice of another frame.
    orig_rows = len(df_processed)
    has_both_names = ((df_processed['Acronym'].str.strip() != '') &
                      (df_processed['Full_Name'].str.strip() != ''))
    df_processed = df_processed[has_both_names].copy()

    if len(df_processed) < orig_rows:
        print(f"Removed {orig_rows - len(df_processed)} rows with empty acronyms or full names")

    # Map categories to standard domains
    standard_domains = {
        'Restaurant': ['restaurant', 'food', 'dining', 'cafe', 'coffee', 'fast food'],
        'Banking': ['banking', 'bank', 'financial institution', 'credit union'],
        'Retail': ['retail', 'store', 'shop', 'department store', 'supermarket', 'grocery'],
        'Technology': ['technology', 'tech', 'software', 'hardware', 'electronics', 'computer'],
        'Automotive': ['automotive', 'auto', 'car', 'vehicle', 'dealership'],
        'Medical': ['medical', 'health', 'healthcare', 'hospital', 'clinic', 'pharmacy'],
        'Government': ['government', 'gov', 'agency', 'federal', 'state', 'municipal'],
        'Education': ['education', 'school', 'university', 'college', 'academic'],
        'Financial': ['financial', 'finance', 'investment', 'insurance', 'wealth management']
    }

    # One compiled pattern per domain: a leading \b pins every keyword to the
    # start of a word, while the open right side still allows suffixes.
    domain_patterns = {
        domain: re.compile(r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + ')')
        for domain, keywords in standard_domains.items()
    }

    def map_to_standard_domain(category):
        category_lower = category.lower()
        for domain, pattern in domain_patterns.items():
            if pattern.search(category_lower):
                return domain
        return category  # Return original if no match

    # Apply domain mapping
    df_processed['Merchant_Category'] = df_processed['Merchant_Category'].apply(map_to_standard_domain)

    # Print category distribution
    print("Category distribution after preprocessing:")
    print(df_processed['Merchant_Category'].value_counts().head(10))

    return df_processed

# Main execution pipeline (early draft kept inside Cell 5; the dedicated Cell 7 below redefines run_merchant_matching_pipeline)
import time

def run_merchant_matching_pipeline(input_file, output_file=None):
    """
    Load, preprocess, score and analyze merchant data in a single call.

    Loading goes through load_merchant_data, so an unreadable input file
    results in the built-in sample data being processed instead.

    NOTE: a function with this same name is defined again in the later
    "Cell 7" cell; once that cell has run, its definition replaces this one.

    Args:
        input_file (str): Path to input file
        output_file (str): Path of the Excel file to write the categorized
            results to; nothing is written when this is None or empty

    Returns:
        DataFrame: Scored results (without the match-category column)
    """
    began_at = time.time()
    print("Running merchant matching pipeline with pre-trained models...")
    print(f"Input file: {input_file}")

    # Step 1: read the raw merchant table
    print("\nStep 1: Loading merchant data...")
    raw_merchants = load_merchant_data(input_file)

    # Step 2: clean names and normalize categories
    print("\nStep 2: Preprocessing merchant data...")
    clean_merchants = preprocess_merchant_data(raw_merchants)

    # Step 3: score every acronym / full-name pair
    print("\nStep 3: Computing similarity scores using pre-trained models...")
    results_df = process_merchant_data(clean_merchants)

    # Step 4: print the analysis report (its returned summary is not used here)
    print("\nStep 4: Analyzing results...")
    analyze_merchant_results(results_df)

    # Optionally persist the categorized scores
    if output_file:
        print(f"\nSaving results to {output_file}...")
        add_match_categories(results_df).to_excel(output_file, index=False)
        print("Results saved successfully!")

    duration = time.time() - began_at
    print(f"\nMerchant matching pipeline completed in {duration:.2f} seconds")

    return results_df

# Score cut-offs for match categorization, keyed by category label.
# analyze_merchant_results reads this module-level dict to order the
# per-category samples it prints; add_match_categories carries its own
# default table with the same values.
thresholds = {
    'Exact Match': 0.95,
    'Strong Match': 0.85,
    'Probable Match': 0.75,
    'Possible Match': 0.65,
    'Weak Match': 0.50,
    'No Match': 0.0
}

# Example usage:
# File paths handed to run_merchant_matching_pipeline
input_file = "Acronym_Categorized.xlsx"  # Excel workbook with the merchant data
output_file = "merchant_matching_results.xlsx" 

# The run itself is left commented out in this cell (uncomment to execute):
# results = run_merchant_matching_pipeline(input_file, output_file)
# print("\nMerchant matching pipeline with pre-trained models completed successfully!")

In [23]:
# Cell 6: Merchant Matching Functions

def process_merchant_data(merchant_df):
    """
    Process merchant data and compute similarity scores using pre-trained models
    
    Args:
        merchant_df (DataFrame): Merchant data DataFrame
        
    Returns:
        DataFrame: DataFrame with similarity scores
    """
    start_time = time.time()
    print(f"Processing {len(merchant_df)} merchant entries...")
    
    # Create a copy of the input DataFrame
    results_df = merchant_df.copy()
    
    # Add columns for similarity scores
    results_df['Basic_Score'] = 0.0
    results_df['Enhanced_Score'] = 0.0
    
    # Create progress tracking
    batch_size = max(1, len(results_df) // 10)  # Show progress in ~10 steps
    
    # Process each merchant entry
    for idx, row in results_df.iterrows():
        acronym = row['Acronym']
        full_name = row['Full_Name']
        category = row['Merchant_Category']
        
        # Basic string preprocessing
        acronym = str(acronym).strip()
        full_name = str(full_name).strip()
        
        # Special case handling for exact matches from dictionary
        acronym_upper = acronym.upper()
        if acronym_upper in COMMON_ACRONYMS and merchant_matcher.jaro_winkler_similarity(
                COMMON_ACRONYMS[acronym_upper], full_name) > 0.85:
            # Known exact match gets maximum score
            results_df.at[idx, 'Basic_Score'] = 0.95
            results_df.at[idx, 'Enhanced_Score'] = 0.98
            continue
            
        # Special case for McDonald's variants
        if (acronym_upper in ['MCD', 'MD', 'MCDs', 'MCDS'] and 
              merchant_matcher.jaro_winkler_similarity('McDonalds', full_name) > 0.7):
            results_df.at[idx, 'Basic_Score'] = 0.93
            results_df.at[idx, 'Enhanced_Score'] = 0.96
            continue
            
        # Special case for Toyota with location
        if ((('toyota' in acronym.lower() and any(loc in full_name.lower() for loc in ['north', 'south', 'east', 'west', 'western', 'eastern'])) or 
               ('toyota' in full_name.lower() and any(loc in acronym.lower() for loc in ['north', 'south', 'east', 'west', 'western', 'eastern'])))):
            results_df.at[idx, 'Basic_Score'] = 0.92
            results_df.at[idx, 'Enhanced_Score'] = 0.95
            continue
        
        # Compute similarity scores
        basic_score = merchant_matcher.compute_weighted_score(acronym, full_name, category)
        enhanced_score = merchant_matcher.compute_enhanced_score(acronym, full_name, category)
        
        # Store scores
        results_df.at[idx, 'Basic_Score'] = basic_score
        results_df.at[idx, 'Enhanced_Score'] = enhanced_score
        
        # Show progress
        if idx % batch_size == 0 or idx == len(results_df) - 1:
            progress = (idx + 1) / len(results_df) * 100
            elapsed = time.time() - start_time
            remaining = elapsed / (idx + 1) * (len(results_df) - idx - 1) if idx > 0 else 0
            print(f"Progress: {progress:.1f}% ({idx+1}/{len(results_df)}) - "
                  f"Elapsed: {elapsed:.1f}s - Est. remaining: {remaining:.1f}s")
    
    total_time = time.time() - start_time
    print(f"Processing completed in {total_time:.2f} seconds")
    
    return results_df

def add_match_categories(results_df, thresholds=None):
    """
    Add a 'Match_Category' column derived from 'Enhanced_Score'.

    Each row receives the category with the highest threshold that its
    score reaches. Rows whose score reaches no threshold (including NaN
    scores) keep the initial label 'No Match'. The category distribution is
    printed, and the input DataFrame is left unmodified.

    Args:
        results_df (DataFrame): Results DataFrame with an 'Enhanced_Score' column
        thresholds (dict): Mapping of category label -> minimum score;
            defaults to the six standard categories when None

    Returns:
        DataFrame: Copy of ``results_df`` with the 'Match_Category' column added
    """
    if thresholds is None:
        thresholds = {
            'Exact Match': 0.95,
            'Strong Match': 0.85,
            'Probable Match': 0.75,
            'Possible Match': 0.65,
            'Weak Match': 0.50,
            'No Match': 0.0
        }

    df = results_df.copy()

    # Add category column based on Enhanced_Score
    df['Match_Category'] = 'No Match'

    # Apply thresholds from lowest to highest. Every pass overwrites the rows
    # at or above its threshold, so the last (highest) threshold a row reaches
    # is the label that sticks. Going highest-first would let the lowest
    # threshold overwrite everything.
    for category, threshold in sorted(thresholds.items(), key=lambda item: item[1]):
        df.loc[df['Enhanced_Score'] >= threshold, 'Match_Category'] = category

    # Print distribution of match categories
    print("\nMatch category distribution:")
    category_counts = df['Match_Category'].value_counts().sort_index()
    for category, count in category_counts.items():
        percentage = count / len(df) * 100
        print(f"  {category}: {count} ({percentage:.1f}%)")

    return df

def analyze_merchant_results(results_df, sample_size=5, thresholds=None):
    """
    Analyze merchant matching results and print detailed information

    Prints overall statistics, sample rows per match category, per
    merchant-category averages and the most improved matches. The input
    DataFrame is not modified.

    Args:
        results_df (DataFrame): Results DataFrame with 'Acronym', 'Full_Name',
            'Merchant_Category', 'Basic_Score' and 'Enhanced_Score' columns
        sample_size (int): Number of samples to show for each category, and
            number of "most improved" rows to list
        thresholds (dict): Mapping of category label -> minimum score, used
            both for categorization and for ordering the printed categories;
            defaults to the six standard categories when None

    Returns:
        dict: ``{'overall_stats': {'mean_basic', 'mean_enhanced',
        'improvement'}, 'category_stats': {category: {'count', 'basic_mean',
        'enhanced_mean', 'improvement'}}}``; improvements are percentages
        relative to the basic mean (0 when that mean is not positive)
    """
    # Use an explicit table instead of depending on a global defined in
    # another notebook cell.
    if thresholds is None:
        thresholds = {
            'Exact Match': 0.95,
            'Strong Match': 0.85,
            'Probable Match': 0.75,
            'Possible Match': 0.65,
            'Weak Match': 0.50,
            'No Match': 0.0
        }

    # Add match categories
    categorized_df = add_match_categories(results_df, thresholds)

    # Calculate overall statistics
    mean_basic = categorized_df['Basic_Score'].mean()
    mean_enhanced = categorized_df['Enhanced_Score'].mean()
    improvement = (mean_enhanced - mean_basic) / mean_basic * 100 if mean_basic > 0 else 0

    print(f"\nOverall Statistics:")
    print(f"  Average Basic Score: {mean_basic:.4f}")
    print(f"  Average Enhanced Score: {mean_enhanced:.4f}")
    print(f"  Overall Improvement: {improvement:.2f}%")

    # Print samples for each category, strongest category first
    categories = categorized_df['Match_Category'].unique()
    print("\nSample matches by category:")

    for category in sorted(categories, key=lambda x: thresholds.get(x, 0), reverse=True):
        cat_df = categorized_df[categorized_df['Match_Category'] == category]
        cat_samples = min(sample_size, len(cat_df))

        if cat_samples > 0:
            print(f"\n{category} ({len(cat_df)} entries):")
            # Random subset only when the category has more rows than requested
            samples = cat_df.sample(cat_samples) if cat_samples < len(cat_df) else cat_df

            for _, row in samples.iterrows():
                print(f"  {row['Acronym']} <-> {row['Full_Name']} "
                      f"(Category: {row['Merchant_Category']}, Score: {row['Enhanced_Score']:.4f})")

    # Analyze by merchant category
    print("\nPerformance by Merchant Category:")

    category_stats = {}
    for category in categorized_df['Merchant_Category'].unique():
        cat_df = categorized_df[categorized_df['Merchant_Category'] == category]

        basic_mean = cat_df['Basic_Score'].mean()
        enhanced_mean = cat_df['Enhanced_Score'].mean()
        cat_improvement = (enhanced_mean - basic_mean) / basic_mean * 100 if basic_mean > 0 else 0

        category_stats[category] = {
            'count': len(cat_df),
            'basic_mean': basic_mean,
            'enhanced_mean': enhanced_mean,
            'improvement': cat_improvement
        }

        print(f"  {category} ({len(cat_df)} entries):")
        print(f"    Basic Score: {basic_mean:.4f}")
        print(f"    Enhanced Score: {enhanced_mean:.4f}")
        print(f"    Improvement: {cat_improvement:.2f}%")

    # Identify most improved matches
    categorized_df['Improvement'] = categorized_df['Enhanced_Score'] - categorized_df['Basic_Score']
    most_improved = categorized_df.nlargest(sample_size, 'Improvement')

    print("\nMost improved matches:")
    for _, row in most_improved.iterrows():
        # Row-level names are kept distinct from `improvement` above, which
        # is returned in overall_stats and must not be overwritten here.
        row_gain = row['Improvement']
        row_gain_pct = row_gain / row['Basic_Score'] * 100 if row['Basic_Score'] > 0 else float('inf')

        print(f"  {row['Acronym']} <-> {row['Full_Name']} "
              f"(Category: {row['Merchant_Category']})")
        print(f"    Basic: {row['Basic_Score']:.4f}, Enhanced: {row['Enhanced_Score']:.4f}, "
              f"Improvement: {row_gain:.4f} ({row_gain_pct:.1f}%)")

    return {
        'overall_stats': {
            'mean_basic': mean_basic,
            'mean_enhanced': mean_enhanced,
            'improvement': improvement
        },
        'category_stats': category_stats
    }

print("Merchant matching functions defined!")

Merchant matching functions defined!


In [25]:
# Cell 7: Main Execution Pipeline
def run_merchant_matching_pipeline(input_file, output_file=None):
    """
    Run the full merchant matching pipeline on an Excel workbook.

    The workbook is read directly with pandas. If it cannot be read, the
    error is printed and the pipeline stops early.

    Args:
        input_file (str): Path to the input Excel file
        output_file (str): Path of the Excel file to write the categorized
            results to; nothing is written when this is None or empty

    Returns:
        DataFrame or None: Scored results (without the match-category
        column), or None when the input file could not be loaded
    """
    clock_start = time.time()
    print("Running merchant matching pipeline with pre-trained models...")
    print(f"Input file: {input_file}")

    # Step 1: read the workbook; bail out with None if that fails
    print("\nStep 1: Loading merchant data...")
    try:
        raw_merchants = pd.read_excel(input_file)
    except Exception as load_error:
        print(f"Error loading merchant data: {load_error!s}")
        return None

    # Step 2: clean names and normalize categories
    print("\nStep 2: Preprocessing merchant data...")
    clean_merchants = preprocess_merchant_data(raw_merchants)

    # Step 3: score every acronym / full-name pair
    print("\nStep 3: Computing similarity scores using pre-trained models...")
    results_df = process_merchant_data(clean_merchants)

    # Step 4: print the analysis report (its returned summary is not used here)
    print("\nStep 4: Analyzing results...")
    analyze_merchant_results(results_df)

    # Optionally persist the categorized scores
    if output_file:
        print(f"\nSaving results to {output_file}...")
        add_match_categories(results_df).to_excel(output_file, index=False)
        print("Results saved successfully!")

    duration = time.time() - clock_start
    print(f"\nMerchant matching pipeline completed in {duration:.2f} seconds")

    return results_df

# Define thresholds for match categorization
# Score cut-offs for match categorization, keyed by category label
thresholds = {
    'Exact Match': 0.95,
    'Strong Match': 0.85,
    'Probable Match': 0.75,
    'Possible Match': 0.65,
    'Weak Match': 0.50,
    'No Match': 0.0
}

# Set up file paths
input_file = "Acronym_Categorized.xlsx"  # Update with your actual file path
output_file = "merchant_matching_results.xlsx"

# Ensure pandas is imported
import pandas as pd
import time

# Run the pipeline!
results = run_merchant_matching_pipeline(input_file, output_file)

# The pipeline returns None when the input file could not be loaded, so
# success is reported only when results actually came back.
if results is not None:
    print("\nMerchant matching pipeline with pre-trained models completed successfully!")
else:
    print("\nMerchant matching pipeline did not complete: the input data could not be loaded.")

Running merchant matching pipeline with pre-trained models...
Input file: Acronym_Categorized.xlsx

Step 1: Loading merchant data...

Step 2: Preprocessing merchant data...
Category distribution after preprocessing:
Merchant_Category
Government         43
Banking             8
Retail              6
Misc Speciality     6
Technology          6
Automotive          5
Restaurant          5
Clothing            4
Medical             3
Telecom             2
Name: count, dtype: int64

Step 3: Computing similarity scores using pre-trained models...
Processing 100 merchant entries...
Progress: 1.0% (1/100) - Elapsed: 0.1s - Est. remaining: 0.0s
Progress: 11.0% (11/100) - Elapsed: 0.9s - Est. remaining: 7.2s
Progress: 21.0% (21/100) - Elapsed: 1.6s - Est. remaining: 6.0s
Progress: 31.0% (31/100) - Elapsed: 2.3s - Est. remaining: 5.1s
Progress: 41.0% (41/100) - Elapsed: 3.0s - Est. remaining: 4.4s
Progress: 51.0% (51/100) - Elapsed: 3.9s - Est. remaining: 3.7s
Progress: 61.0% (61/100) - Elapsed: 4.