Strategy1

In [None]:
# %% Cell 1: Import Libraries
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from Levenshtein import jaro_winkler, distance as lev_distance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import FastText
from sentence_transformers import SentenceTransformer
from nltk import ngrams
from fuzzy import DoubleMetaphone
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix
from wordcloud import WordCloud
import joblib

print("✅ Libraries imported successfully!")

In [None]:
# %% Cell 2: Preprocessing Class
class MerchantPreprocessor:
    """Normalise raw merchant names before similarity scoring.

    The pipeline lower-cases the text, strips ASCII punctuation, expands
    known abbreviations token by token, snaps near-miss tokens onto known
    expansions and finally drops corporate-suffix stopwords.
    """

    def __init__(self):
        # token -> canonical expansion.
        # NOTE: lookup happens per whitespace token *after* punctuation has
        # been stripped, so keys containing punctuation or spaces
        # ('j&j', '7-11', 'b of a') can never be hit as written; their
        # punctuation-free siblings ('jj', '711', 'bofa') cover those inputs.
        self.abbreviations = {
            'bofa': 'bank of america', 'b of a': 'bank of america',
            'boa': 'bank of america', 'j&j': 'johnson & johnson',
            'jj': 'johnson johnson', 'jnj': 'johnson and johnson',
            'ibm': 'international business machines', 'amex': 'american express',
            'wf': 'wells fargo', 'wm': 'walmart', 'sbux': 'starbucks',
            'hd': 'home depot', 'cvs': 'cvs pharmacy', 'mcd': 'mcdonalds',
            '7-11': '7-eleven', '711': '7-eleven', 'rd': 'road', 
            'st': 'street', 'ave': 'avenue', 'blvd': 'boulevard',
            'ctr': 'center', 'ln': 'lane', 'dr': 'drive'
        }
        
        # Tokens removed at the very end of preprocess()
        self.stopwords = {'inc', 'llc', 'co', 'ltd', 'corp', 'plc', 'na', 'the'}

    def preprocess(self, text):
        """Return the normalised form of ``text``.

        Args:
            text: Raw merchant name. Anything that is not a non-empty ``str``
                (``None``, ``NaN``, numbers, ``""``) yields ``""`` instead of
                raising, so the method can be applied to a whole column.

        Returns:
            Space-joined normalised tokens.
        """
        if not text or not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
        tokens = text.split()
        tokens = [self.abbreviations.get(token, token) for token in tokens]
        tokens = [self.correct_typo(token) for token in tokens]
        tokens = [token for token in tokens if token not in self.stopwords]
        return ' '.join(tokens)
    
    def correct_typo(self, token, max_distance=2):
        """Snap ``token`` onto the closest known expansion, if one is near enough.

        Args:
            token: A single (possibly already expanded) token.
            max_distance: Exclusive upper bound on the Levenshtein distance a
                candidate may have; with the default of 2 only distance-1
                candidates qualify.

        Returns:
            The expansion with the smallest distance below ``max_distance``
            (first one wins on ties), or ``token`` unchanged when there is
            none or when ``token`` already is an expansion.
        """
        known = self.abbreviations.values()
        if token in known:
            return token
        best, best_dist = token, max_distance
        for candidate in known:
            dist = lev_distance(token, candidate)
            # Strictly smaller keeps the *nearest* candidate; the previous
            # `dist > closest[1]` test kept the farthest qualifying one.
            if 0 < dist < best_dist:
                best, best_dist = candidate, dist
        return best

# Smoke-test the preprocessor on a few hand-picked (raw, expected) pairs
preprocessor = MerchantPreprocessor()
test_samples = [
    ("McDonald's", "mcdonalds"),
    ("Starbucks Coffee Co.", "SBUX"),
    ("123 Main St. NW", "123 main street northwest")
]

print("🔄 Testing preprocessing:")
for raw_name, target in test_samples:
    normalised = preprocessor.preprocess(raw_name)
    is_match = normalised == target
    # One three-line report per sample, followed by a blank line
    print(f"Original: {raw_name}\nCleaned: {normalised}\nMatch: {is_match}\n")

In [None]:
# %% Cell 3: Load and Preprocess Data
# Toy labelled pairs (replace with your actual data)
data = {
    'original': ["McDonald's", "Walmart Supercenter", "Bank of America Corp"],
    'variant': ["McDonalds", "Wal-Mart", "BofA"],
    'expected_binary': [1, 1, 1]
}
df = pd.DataFrame(data)

# Add one cleaned column per raw name column
print("🛠 Preprocessing data...")
for raw_col, clean_col in (('original', 'clean_orig'), ('variant', 'clean_var')):
    df[clean_col] = df[raw_col].apply(preprocessor.preprocess)

# Raw and cleaned names side by side
print("\n🔍 Preprocessing Results:")
display(df[['original', 'clean_orig', 'variant', 'clean_var']])

In [None]:
# %% Cell 4: Initialize Feature Engineering
print("⚙️ Initializing feature engines...")
class SimilarityFeatures:
    """Holds the heavyweight models the similarity features are computed with.

    Constructing an instance loads a FastText binary from the working
    directory and a sentence-transformers model by name, so it is slow and
    memory hungry; create it once and reuse it.
    """

    def __init__(self):
        # FastText.load_fasttext_format is deprecated (and absent from recent
        # gensim releases); load_facebook_model reads the same Facebook .bin
        # file and likewise returns a full FastText model exposing `.wv`.
        from gensim.models.fasttext import load_facebook_model

        self.tfidf = TfidfVectorizer()
        print("Loading FastText model...")
        self.fasttext = load_facebook_model('cc.en.300.bin')
        print("Loading BERT model...")
        self.bert = SentenceTransformer('all-MiniLM-L6-v2')
        self.dmeta = DoubleMetaphone()
        
    # ... (keep all feature methods from previous answer)
    # NOTE(review): no feature methods are defined in this cell, yet later
    # cells call feature_engine.get_features(clean_orig, clean_var) and build
    # a 7-column frame from its result. Paste those methods here before
    # running the notebook end to end.

feature_engine = SimilarityFeatures()
print("✅ Feature engines initialized!")

In [None]:
# %% Cell 5: Generate Similarity Features
print("🧮 Calculating similarity features...")
# One feature vector per (cleaned original, cleaned variant) pair, in row order
features = [
    feature_engine.get_features(clean_orig, clean_var)
    for clean_orig, clean_var in zip(df['clean_orig'], df['clean_var'])
]

feature_cols = ['jaro_winkler', 'levenshtein', 'tfidf_cos', 
               'fasttext', 'bert', 'jaccard', 'phonetic']
X = pd.DataFrame(features, columns=feature_cols)
y = df['expected_binary']

print("\n📊 Generated Features Sample:")
display(X.head())

In [None]:
# %% Cell 6: Train-Test Split & Model Training
print("🤖 Training XGBoost model...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameters gathered in one place so they are easy to tune
xgb_params = dict(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
)
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

print("✅ Model trained successfully!")

In [None]:
# %% Cell 7: Model Evaluation
print("📈 Model Evaluation:")
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # probability of the positive class

# Per-class precision / recall / F1
print("\n📝 Classification Report:")
print(classification_report(y_test, y_pred))

# ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# %% Cell 8: Feature Importance
print("🔍 Feature Importance:")
# plot_importance creates and returns its own axes; title it directly
importance_ax = xgb.plot_importance(model)
importance_ax.set_title('XGBoost Feature Importance')
plt.show()

# Confusion matrix on the held-out split
print("\n📊 Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

In [None]:
# %% Cell 9: Save Model & Preprocessor
print("💾 Saving artifacts...")
joblib.dump(preprocessor, 'preprocessor.pkl')
joblib.dump(feature_engine, 'feature_engine.pkl')
model.save_model('merchant_matcher.xgb')

print("✅ Artifacts saved:")
!ls -lh *.pkl *.xgb

In [None]:
# %% Cell 10: Example Prediction
print("🔮 Sample Prediction:")
sample_data = {
    'original': ['Home Depot Inc'],
    'variant': ['The Home Depot LLC']
}

# Clean both name columns the same way the training data was cleaned
sample_df = pd.DataFrame(sample_data)
for raw_col, clean_col in (('original', 'clean_orig'), ('variant', 'clean_var')):
    sample_df[clean_col] = sample_df[raw_col].apply(preprocessor.preprocess)

features = [
    feature_engine.get_features(clean_orig, clean_var)
    for clean_orig, clean_var in zip(sample_df['clean_orig'], sample_df['clean_var'])
]

X_sample = pd.DataFrame(features, columns=feature_cols)
prediction = model.predict_proba(X_sample)[0][1]  # P(match) for the single row

print(f"\nOriginal: {sample_data['original'][0]}")
print(f"Variant: {sample_data['variant'][0]}")
print(f"Match Probability: {prediction:.2%}")

To use this notebook:

Create new cells between these sections as needed

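Download the FastText vectors before running Cell 4: the model is published as a gzip archive, so run !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz followed by !gunzip cc.en.300.bin.gz so that cc.en.300.bin exists in the working directory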

Install requirements:

In [None]:
pip install gensim sentence-transformers xgboost Levenshtein fuzzy wordcloud

Run cells sequentially

Intermediate outputs will show:

Preprocessing results

Feature samples

Evaluation metrics

Visualizations

Model artifacts

Example predictions

Each section can be modified independently:

Adjust preprocessing rules in Cell 2

Modify/add features in Cell 4

Tune model parameters in Cell 6

Add new visualizations in Cells 7–8

Create new prediction examples in Cell 10

Strategy2

In [None]:
# Basic data handling libraries
import pandas as pd
import numpy as np
import re
import string
from collections import defaultdict

# String similarity metrics
from Levenshtein import distance as levenshtein_distance
from Levenshtein import jaro_winkler

# Text processing and feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import ngrams
import nltk

# ML libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve, f1_score, confusion_matrix

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Download NLTK resources (run once)
# Fetches the 'punkt' tokenizer data via nltk.download (needs network access
# the first time it runs).
# NOTE(review): none of the cells visible in this notebook call an NLTK
# tokenizer -- confirm whether this download is still required.
nltk.download('punkt')

# Reaching this line means every import above succeeded
print("All libraries imported successfully!")

In [None]:
#2 
# Since we don't have the actual data, let's create a synthetic dataset
# with merchant name variations that represents real-world challenges

def _make_variation(base, variation_type, rng):
    """Return one noisy rendering of ``base`` that should still count as a match.

    ``rng`` is either the ``numpy.random`` module or a ``RandomState``; both
    expose the ``randint`` / ``random_sample`` / ``choice`` / ``shuffle``
    calls used here.
    """
    if variation_type == 'abbreviation':
        # Well-known short forms first, otherwise the initials of each word
        for needle, short_form in (("Bank of America", 'BOFA'), ("McDonald's", "McD"),
                                   ("Starbucks", "SBUX"), ("Walmart", "WMT")):
            if needle in base:
                return short_form
        return ''.join(word[0] for word in base.split())

    if variation_type == 'typo':
        chars = list(base)
        pos = rng.randint(0, len(chars))
        if rng.random_sample() > 0.5:
            # Replace a character (may pick the same one, leaving base intact)
            chars[pos] = str(rng.choice(list(string.ascii_letters + ' ')))
        else:
            # Delete a character
            chars.pop(pos)
        return ''.join(chars)

    if variation_type == 'word_order':
        words = base.split()
        if len(words) > 1:
            rng.shuffle(words)
            return ' '.join(words)
        # A single word cannot be reordered: substitute one letter instead
        chars = list(base)
        pos = rng.randint(0, len(chars))
        chars[pos] = str(rng.choice(list(string.ascii_letters)))
        return ''.join(chars)

    if variation_type == 'extra_words':
        extras = ['Inc', 'LLC', 'Corporation', 'Co', 'Ltd', 'Store', 'Shop', 'Center']
        return base + ' ' + str(rng.choice(extras))

    if variation_type == 'missing_words':
        words = base.split()
        if len(words) > 1:
            words.pop(rng.randint(0, len(words)))
            return ' '.join(words)
        # A single word cannot lose a word: keep the first 70% of it
        return base[:int(len(base) * 0.7)]

    raise ValueError(f"unknown variation type: {variation_type!r}")


def create_sample_data(n_samples=100, seed=None):
    """Build a synthetic, labelled set of merchant-name pairs.

    The first ``len(merchant_base)`` rows walk through the base merchants in
    order so each appears at least once; later rows pick a base at random.
    Roughly 70% of rows are matches (a perturbed copy of the base name), the
    rest pair the base with a different merchant.

    Args:
        n_samples: Number of rows to generate.
        seed: Optional integer. When given, a private ``RandomState`` is used
            and the output is fully reproducible; when ``None`` (default) the
            global ``numpy.random`` state is used, as before.

    Returns:
        DataFrame with columns ``original``, ``variant`` and
        ``expected_binary`` (1 = match, 0 = non-match).
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Original merchant names
    merchant_base = [
        "Bank of America", "McDonald's", "Walmart Supercenter", "Home Depot", 
        "Starbucks Coffee", "7-Eleven", "CVS Pharmacy", "Target", "Amazon.com",
        "Costco Wholesale", "Apple Store", "Walgreens", "Best Buy", "Shell",
        "Chevron", "AT&T", "Verizon Wireless", "T-Mobile", "Wells Fargo Bank",
        "Chase Bank"
    ]
    variation_types = ['abbreviation', 'typo', 'word_order', 'extra_words', 'missing_words']

    records = []
    for i in range(n_samples):
        if i < len(merchant_base):
            base = merchant_base[i]
        else:
            base = merchant_base[rng.randint(0, len(merchant_base))]

        is_match = rng.random_sample() > 0.3  # 70% are matches

        if is_match:
            variation = _make_variation(base, str(rng.choice(variation_types)), rng)
        else:
            # Non-match: pair the base with any *other* merchant
            variation = str(rng.choice([m for m in merchant_base if m != base]))

        records.append((base, variation, int(is_match)))

    return pd.DataFrame(records, columns=['original', 'variant', 'expected_binary'])

# Generate sample data
# Seed NumPy's global generator first: the dataset is random, and without a
# fixed seed every downstream metric changes from one run to the next.
np.random.seed(42)
merchant_data = create_sample_data(200)

# Display some examples
print(f"Sample dataset created with {len(merchant_data)} records")
print("\nSome matching examples:")
print(merchant_data[merchant_data['expected_binary'] == 1].head(5))
print("\nSome non-matching examples:")
print(merchant_data[merchant_data['expected_binary'] == 0].head(5))

In [None]:
#3
class MerchantPreprocessor:
    """Normalise merchant names so that variants of one merchant converge.

    ``preprocess`` lower-cases, expands abbreviations, strips punctuation and
    removes business-entity stopwords. Expansions are re-tokenised before the
    stopword filter, so a spelled-out name and its abbreviation end up with
    the same cleaned text (e.g. "Bank of America" and "BofA").
    """

    # Apostrophes are deleted rather than spaced so that possessives stay one
    # token ("mcdonald's" -> "mcdonalds", matching the 'mcd' expansion).
    _APOSTROPHES = re.compile(r"['’`]")
    _NON_WORD = re.compile(r'[^\w\s]')
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self):
        # Comprehensive abbreviation dictionary
        self.abbreviations = {
            'bofa': 'bank of america', 
            'b of a': 'bank of america',
            'boa': 'bank of america', 
            'j&j': 'johnson and johnson',
            'jj': 'johnson johnson', 
            'jnj': 'johnson and johnson',
            'ibm': 'international business machines', 
            'amex': 'american express',
            'wf': 'wells fargo', 
            'wmt': 'walmart', 
            'sbux': 'starbucks',
            'hd': 'home depot', 
            'cvs': 'cvs pharmacy', 
            'mcd': 'mcdonalds',
            '7-11': '7-eleven', 
            '711': '7-eleven', 
            'rd': 'road',
            'st': 'street', 
            'ave': 'avenue', 
            'blvd': 'boulevard',
            'ctr': 'center', 
            'ln': 'lane', 
            'dr': 'drive'
        }
        
        # Business entity terms to remove
        self.stopwords = {
            'inc', 'llc', 'co', 'ltd', 'corp', 'corporation', 
            'plc', 'na', 'the', 'and', 'of', '&'
        }

    def _normalize(self, text):
        """Drop apostrophes, turn other punctuation into spaces, squeeze whitespace."""
        text = self._APOSTROPHES.sub('', text)
        text = self._NON_WORD.sub(' ', text)
        return self._WHITESPACE.sub(' ', text).strip()
    
    def preprocess(self, text):
        """Normalize merchant name text.

        Args:
            text: Raw merchant name. Falsy or non-``str`` values yield ``""``.

        Returns:
            Cleaned, space-joined tokens (possibly ``""`` when only stopwords
            or punctuation were present).
        """
        if not text or not isinstance(text, str):
            return ""
        
        # Convert to lowercase
        text = text.lower()

        # Pass 1: keys that contain punctuation or spaces ('7-11', 'j&j',
        # 'b of a') can only be recognised before punctuation is stripped and
        # the text is tokenised, so replace them as whole phrases here.
        for key, expansion in self.abbreviations.items():
            if not key.isalnum():
                pattern = r'(?<!\w)' + re.escape(key) + r'(?!\w)'
                text = re.sub(pattern, lambda _m, rep=expansion: rep, text)

        # Remove punctuation, collapse whitespace, tokenize
        tokens = self._normalize(text).split()

        # Pass 2: single-token abbreviations
        cleaned_tokens = []
        for i, token in enumerate(tokens):
            expansion = self.abbreviations.get(token)
            if expansion is None:
                cleaned_tokens.append(token)
                continue
            # Normalise the expansion like the input ('7-eleven' -> '7 eleven')
            # and split it so stopwords inside it are filtered too.
            words = self._normalize(expansion).split()
            if tokens[i:i + len(words)] == words:
                # Already spelled out ("cvs pharmacy"): expanding again would
                # duplicate the trailing words.
                cleaned_tokens.append(token)
            else:
                cleaned_tokens.extend(words)
        
        # Remove stopwords
        cleaned_tokens = [token for token in cleaned_tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(cleaned_tokens)
    
    def correct_typo(self, token, max_distance=2):
        """Attempt to correct typos using Levenshtein distance.

        Not invoked by ``preprocess``; available for callers that want it.

        Args:
            token: Candidate token.
            max_distance: Largest edit distance (inclusive) a correction may have.

        Returns:
            The closest known expansion within ``max_distance`` whose length
            differs from ``token`` by at most 2, else ``token`` unchanged.
        """
        if not token or token in self.abbreviations.values():
            return token
        
        best_match = None
        min_dist = max_distance + 1
        
        for correct in self.abbreviations.values():
            # Only consider words of similar length to avoid false corrections
            if abs(len(token) - len(correct)) <= 2:
                dist = levenshtein_distance(token, correct)
                # min_dist starts at max_distance + 1, so beating it already
                # implies dist <= max_distance
                if dist < min_dist:
                    min_dist = dist
                    best_match = correct
        
        return best_match if best_match else token

# Clean both name columns of the sample data
preprocessor = MerchantPreprocessor()
for raw_col, clean_col in (('original', 'clean_original'), ('variant', 'clean_variant')):
    merchant_data[clean_col] = merchant_data[raw_col].apply(preprocessor.preprocess)

# Raw vs. cleaned names next to the label
results_df = merchant_data[['original', 'clean_original', 'variant', 'clean_variant', 'expected_binary']]
print("Example preprocessing results:")
print(results_df.head(10))

# How much shorter do names get after cleaning?
raw_lengths = merchant_data['original'].str.len()
clean_lengths = merchant_data['clean_original'].str.len()

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist([raw_lengths, clean_lengths], 
        bins=20, alpha=0.5, label=['Original', 'Preprocessed'])
ax.set_title('Effect of Preprocessing on Name Length')
ax.set_xlabel('Character Length')
ax.set_ylabel('Frequency')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
#4
class SimilarityFeatures:
    """String-similarity features for a pair of (already cleaned) names.

    Every ``*_sim`` / ``tfidf_cosine`` / ``jaccard_ngram`` method returns
    ``0.0`` when either input is empty.
    """

    def __init__(self):
        # Character 2- and 3-grams within word boundaries; refitted per pair
        self.tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 3))
    
    def jaro_winkler_sim(self, s1, s2):
        """Calculate Jaro-Winkler similarity between two strings"""
        if not s1 or not s2:
            return 0.0
        return jaro_winkler(s1, s2)
    
    def levenshtein_sim(self, s1, s2):
        """Calculate normalized Levenshtein similarity between two strings.

        Returns ``1 - distance / max(len(s1), len(s2))``.
        """
        if not s1 or not s2:
            return 0.0
        # Both strings are non-empty here, so the divisor is at least 1
        max_len = max(len(s1), len(s2))
        return 1.0 - (levenshtein_distance(s1, s2) / max_len)
    
    def tfidf_cosine(self, s1, s2):
        """Calculate TF-IDF cosine similarity between two strings.

        The vectorizer is fitted on just these two strings. Returns ``0.0``
        when no vocabulary can be built from them.
        """
        if not s1 or not s2:
            return 0.0
        try:
            # Fit and transform with both strings
            tfidf_matrix = self.tfidf.fit_transform([s1, s2])
        except ValueError:
            # Raised by the vectorizer for an empty vocabulary. Catching only
            # this keeps KeyboardInterrupt and genuine bugs visible.
            return 0.0
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    @staticmethod
    def _char_ngrams(s, n):
        """Set of character n-grams of ``s``; a string shorter than ``n`` is its own gram."""
        if len(s) < n:
            return {s}
        return {s[i:i + n] for i in range(len(s) - n + 1)}
    
    def jaccard_ngram(self, s1, s2, n=2):
        """Calculate Jaccard similarity of character n-grams.

        Strings shorter than ``n`` are treated as a single gram, so two
        identical one-character names score 1.0 rather than 0.0.
        """
        if not s1 or not s2:
            return 0.0
        
        s1_ngrams = self._char_ngrams(s1, n)
        s2_ngrams = self._char_ngrams(s2, n)
        
        if not s1_ngrams or not s2_ngrams:
            return 0.0
            
        # Calculate Jaccard similarity
        intersection = len(s1_ngrams & s2_ngrams)
        union = len(s1_ngrams | s2_ngrams)
        return intersection / union if union > 0 else 0.0
    
    def get_basic_features(self, s1, s2):
        """Extract all basic string similarity features as a name -> score dict"""
        return {
            'jaro_winkler': self.jaro_winkler_sim(s1, s2),
            'levenshtein': self.levenshtein_sim(s1, s2),
            'tfidf_cosine': self.tfidf_cosine(s1, s2),
            'jaccard_ngram': self.jaccard_ngram(s1, s2)
        }

# Extract basic features
feature_extractor = SimilarityFeatures()

# Calculate features for each merchant pair (one dict per row, in row order)
basic_features = [
    feature_extractor.get_basic_features(clean_original, clean_variant)
    for clean_original, clean_variant in zip(merchant_data['clean_original'],
                                             merchant_data['clean_variant'])
]

# Convert to DataFrame and join with original data.
# - index=merchant_data.index keeps the rows aligned even if merchant_data
#   does not carry a default 0..n-1 index.
# - Dropping any existing feature columns first makes the cell safe to re-run;
#   a plain concat would append a second copy of every feature column.
basic_features_df = pd.DataFrame(basic_features, index=merchant_data.index)
merchant_data = pd.concat(
    [merchant_data.drop(columns=basic_features_df.columns, errors='ignore'),
     basic_features_df],
    axis=1,
)

# Display the features
print("Sample of extracted basic features:")
print(merchant_data[['clean_original', 'clean_variant', 'jaro_winkler', 'levenshtein', 
                    'tfidf_cosine', 'jaccard_ngram', 'expected_binary']].head(10))

# Visualize the distribution of features by match status
plt.figure(figsize=(15, 10))

for i, feature in enumerate(['jaro_winkler', 'levenshtein', 'tfidf_cosine', 'jaccard_ngram']):
    plt.subplot(2, 2, i+1)
    for label, color in [(1, 'green'), (0, 'red')]:
        subset = merchant_data[merchant_data['expected_binary'] == label]
        plt.hist(subset[feature], bins=20, alpha=0.5, color=color, 
                 label=f'{"Match" if label==1 else "Non-match"}')
    plt.title(f'{feature.replace("_", " ").title()} Distribution')
    plt.xlabel('Similarity Score')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
#5
# Pairwise correlation between the four similarity features
feature_columns = ['jaro_winkler', 'levenshtein', 'tfidf_cosine', 'jaccard_ngram']
corr_matrix = merchant_data[feature_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

# Correlation of each feature with the label, strongest (by magnitude) first
target = merchant_data['expected_binary']
correlations = sorted(
    ((name, merchant_data[name].corr(target)) for name in feature_columns),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)

features = [name for name, _ in correlations]
corr_values = [value for _, value in correlations]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(features, corr_values, color='skyblue')
ax.set_xlabel('Correlation with Expected Match')
ax.set_title('Feature Correlation with Expected Match')
ax.set_xlim(-1, 1)
ax.grid(True, alpha=0.3)
plt.show()

print("Feature correlations with expected match:")
for feature, corr in correlations:
    print(f"{feature}: {corr:.4f}")

In [None]:
#6
# Create feature matrix and target vector
X = merchant_data[feature_columns]
y = merchant_data['expected_binary']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")


def _best_f1_threshold(y_true, scores):
    """Return the PR curve for ``scores`` plus the threshold that maximises F1.

    Returns:
        (precision, recall, thresholds, f1_scores, best_threshold, best_f1).
        ``best_threshold`` falls back to 0 when the best F1 sits on the final
        curve point, which has no associated threshold.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, scores)
    f1_scores = 2 * precision * recall / (precision + recall + 1e-7)  # Avoid division by zero
    best_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0
    return precision, recall, thresholds, f1_scores, best_threshold, f1_scores[best_idx]


# Analyze basic thresholds for each feature.
# NOTE(review): thresholds are picked on the test split, so the F1 values
# reported here are optimistic -- consider a separate validation split.
threshold_summary = []
plt.figure(figsize=(15, 10))

for i, feature in enumerate(feature_columns):
    plt.subplot(2, 2, i+1)
    
    precision, recall, thresholds, f1_scores, best_threshold, best_f1 = \
        _best_f1_threshold(y_test, X_test[feature])
    threshold_summary.append((feature, best_threshold, best_f1))
    
    # Plot curves (the final PR point has no threshold, hence [:-1])
    plt.plot(thresholds, precision[:-1], 'b-', label='Precision')
    plt.plot(thresholds, recall[:-1], 'g-', label='Recall')
    plt.plot(thresholds, f1_scores[:-1], 'r-', label='F1 Score')
    
    # Mark best threshold
    plt.axvline(x=best_threshold, color='k', linestyle='--', 
                label=f'Best threshold: {best_threshold:.3f} (F1: {best_f1:.3f})')
    
    plt.title(f'{feature.replace("_", " ").title()} Threshold Analysis')
    plt.xlabel('Threshold')
    plt.ylabel('Score')
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print best thresholds for each feature (reusing the values computed above)
print("\nBest thresholds based on F1 score:")
for feature, best_threshold, best_f1 in threshold_summary:
    print(f"{feature}: threshold = {best_threshold:.3f}, F1 = {best_f1:.3f}")

In [None]:
#7
# Create the XGBoost classifier
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=42
)

# Train the model (the eval_set is only monitored; no early stopping is configured)
xgb_model.fit(
    X_train, 
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

# Make predictions on test set
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba >= 0.5).astype(int)

# Print classification report
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred))

# Plot feature importance.
# plot_importance creates its own figure unless it is handed an Axes, so a
# preceding plt.figure() would be left behind as an empty plot; pass ax
# explicitly so the requested figure size is actually used.
fig, ax = plt.subplots(figsize=(10, 6))
xgb.plot_importance(xgb_model, ax=ax)
ax.set_title('XGBoost Feature Importance')
fig.tight_layout()
plt.show()

# Visualize confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()

In [None]:
#8
# Find the optimal probability threshold
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores = 2 * precision * recall / (precision + recall + 1e-7)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0
best_f1 = f1_scores[best_idx]

# Plot precision-recall curve
plt.figure(figsize=(10, 6))
plt.plot(recall, precision, 'b-', label='Precision-Recall curve')
plt.plot(recall[best_idx], precision[best_idx], 'ro', 
         label=f'Best threshold: {best_threshold:.3f} (F1: {best_f1:.3f})')
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.axhline(y=np.mean(y_test), color='r', linestyle='--', 
            label=f'Baseline precision: {np.mean(y_test):.3f}')
plt.grid(True, alpha=0.3)
plt.legend(loc='best')
plt.show()

# Update predictions using best threshold
y_pred_optimized = (y_pred_proba >= best_threshold).astype(int)

# Print updated classification report
print(f"Classification Report with Optimized Threshold ({best_threshold:.3f}):")
print(classification_report(y_test, y_pred_optimized))

# Compare with individual features
print("\nComparison of F1 scores:")
print(f"XGBoost (default threshold): {f1_score(y_test, y_pred):.3f}")
print(f"XGBoost (optimized threshold): {f1_score(y_test, y_pred_optimized):.3f}")

for feature in feature_columns:
    # Find best threshold for this feature.
    # Dedicated feat_* names are used on purpose: reusing best_threshold /
    # precision / recall here would overwrite the model-level values above,
    # and later cells persist and report best_threshold.
    feat_precision, feat_recall, feat_thresholds = precision_recall_curve(y_test, X_test[feature])
    feat_f1_scores = 2 * feat_precision * feat_recall / (feat_precision + feat_recall + 1e-7)
    feat_best_idx = np.argmax(feat_f1_scores)
    feat_threshold = feat_thresholds[feat_best_idx] if feat_best_idx < len(feat_thresholds) else 0
    
    # Make predictions using best threshold
    feature_preds = (X_test[feature] >= feat_threshold).astype(int)
    feature_f1 = f1_score(y_test, feature_preds)
    
    print(f"{feature}: {feature_f1:.3f}")

In [None]:
#9
# Create DataFrame with test results
results_df = X_test.copy()
results_df['original'] = merchant_data.loc[X_test.index, 'original'].values
results_df['variant'] = merchant_data.loc[X_test.index, 'variant'].values
results_df['clean_original'] = merchant_data.loc[X_test.index, 'clean_original'].values
results_df['clean_variant'] = merchant_data.loc[X_test.index, 'clean_variant'].values
results_df['expected'] = y_test.values
results_df['predicted'] = y_pred_optimized
results_df['probability'] = y_pred_proba
results_df['is_error'] = results_df['expected'] != results_df['predicted']

# Look at errors.
# .copy() gives an independent frame: columns are added to it below, and
# assigning to a filtered slice triggers pandas' SettingWithCopyWarning.
errors_df = results_df[results_df['is_error']].copy()
print(f"Total errors: {len(errors_df)} out of {len(results_df)} test samples ({len(errors_df)/len(results_df)*100:.1f}%)")

# Display some false positives (predicted match, but not a match)
print("\nFalse Positives (predicted match, but not a match):")
false_positives = errors_df[errors_df['predicted'] == 1]
if len(false_positives) > 0:
    print(false_positives[['original', 'variant', 'probability']].head(5))
else:
    print("No false positives")

# Display some false negatives (predicted non-match, but is a match)
print("\nFalse Negatives (predicted non-match, but is a match):")
false_negatives = errors_df[errors_df['predicted'] == 0]
if len(false_negatives) > 0:
    print(false_negatives[['original', 'variant', 'probability']].head(5))
else:
    print("No false negatives")

# Analyze error patterns
if len(errors_df) > 0:
    # Calculate string lengths
    errors_df['orig_len'] = errors_df['original'].str.len()
    errors_df['var_len'] = errors_df['variant'].str.len()
    errors_df['len_diff'] = (errors_df['orig_len'] - errors_df['var_len']).abs()
    
    # Analyze length difference correlation with errors
    plt.figure(figsize=(10, 6))
    plt.scatter(errors_df['len_diff'], errors_df['probability'], c='red', alpha=0.6)
    plt.title('Error Analysis: String Length Difference vs. Probability')
    plt.xlabel('Absolute Difference in String Length')
    plt.ylabel('Predicted Probability')
    plt.grid(True, alpha=0.3)
    plt.show()
    
    # Compare error distributions by feature
    plt.figure(figsize=(15, 10))
    for i, feature in enumerate(feature_columns):
        plt.subplot(2, 2, i+1)
        plt.scatter(errors_df[feature], errors_df['probability'], c='red', alpha=0.6)
        plt.title(f'Errors: {feature.replace("_", " ").title()} vs. Probability')
        plt.xlabel(feature)
        plt.ylabel('Predicted Probability')
        plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
#10
# This cell is conceptual/pseudo-code for PySpark implementation
# You would need to uncomment and adapt this for actual use with PySpark

'''
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType, FloatType, ArrayType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

# Initialize Spark
spark = SparkSession.builder \
    .appName("Merchant Matching") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

# Register UDFs for feature extraction
@udf(returnType=StringType())
def preprocess_text_udf(text):
    # Implement the preprocessing logic here
    # This is a simplified version
    if not text or not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

@udf(returnType=FloatType())
def jaro_winkler_udf(s1, s2):
    if not s1 or not s2:
        return 0.0
    return jaro_winkler(s1, s2)

@udf(returnType=FloatType())
def levenshtein_sim_udf(s1, s2):
    if not s1 or not s2:
        return 0.0
    max_len = max(len(s1), len(s2))
    if max_len == 0:
        return 1.0
    return 1.0 - (levenshtein_distance(s1, s2) / max_len)

# Load data
df = spark.read.parquet("path/to/merchant_data.parquet")

# Apply preprocessing
df = df.withColumn("clean_original", preprocess_text_udf(col("original")))
df = df.withColumn("clean_variant", preprocess_text_udf(col("variant")))

# Extract features
df = df.withColumn("jaro_winkler", jaro_winkler_udf(col("clean_original"), col("clean_variant")))
df = df.withColumn("levenshtein", levenshtein_sim_udf(col("clean_original"), col("clean_variant")))

# More features would be added here...

# Assemble features into a single vector
assembler = VectorAssembler(
    inputCols=["jaro_winkler", "levenshtein"],  # Add more features as needed
    outputCol="features"
)

# Build the model
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="expected_binary",
    maxIter=10
)

# Create pipeline
pipeline = Pipeline(stages=[assembler, gbt])

# Split data
training_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

# Train model
model = pipeline.fit(training_data)

# Make predictions
predictions = model.transform(test_data)

# Evaluate model
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="probability",
    labelCol="expected_binary"
)
auc = evaluator.evaluate(predictions)

print(f"AUC: {auc}")

# Save model
model.save("path/to/merchant_matching_model")
'''

# Reminder of what is needed before the PySpark sketch above can be run
pyspark_notes = [
    "This is a conceptual implementation for PySpark.",
    "To actually run this code, you would need to:",
    "1. Set up a Spark cluster or use a local Spark instance",
    "2. Have your merchant data in a compatible format (e.g., Parquet)",
    "3. Uncomment and possibly modify the code above",
    "4. Add more features as needed",
]
print("\n".join(pyspark_notes))

In [None]:
#11
# Save the model and preprocessor for future use
import os
import pickle

# Create a directory to save model artifacts.
# exist_ok=True replaces the check-then-create pair: it is a single call, has
# no window between the check and the creation, and fails loudly if 'model'
# exists but is not a directory.
os.makedirs('model', exist_ok=True)

# Save the trained XGBoost model
xgb_model.save_model('model/merchant_matcher.xgb')

# Save the preprocessor and the feature extractor
for artifact, path in ((preprocessor, 'model/preprocessor.pkl'),
                       (feature_extractor, 'model/feature_extractor.pkl')):
    with open(path, 'wb') as f:
        pickle.dump(artifact, f)

# Save the optimal threshold as plain text
with open('model/optimal_threshold.txt', 'w') as f:
    f.write(str(best_threshold))

print("Model and pipeline components saved successfully!")

In [None]:
#12
# Example of how to load and use the model for new data
def load_model_components():
    """Reload the four artifacts stored under ``model/``.

    Returns:
        Tuple ``(model, preprocessor, feature_extractor, threshold)`` where
        ``threshold`` is the float parsed from ``optimal_threshold.txt``.
    """
    def _unpickle(path):
        # pickle executes code on load: only use files this notebook wrote
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # XGBoost restores its state into a freshly constructed classifier
    loaded_model = xgb.XGBClassifier()
    loaded_model.load_model('model/merchant_matcher.xgb')

    loaded_preprocessor = _unpickle('model/preprocessor.pkl')
    loaded_feature_extractor = _unpickle('model/feature_extractor.pkl')

    with open('model/optimal_threshold.txt', 'r') as handle:
        loaded_threshold = float(handle.read().strip())

    return loaded_model, loaded_preprocessor, loaded_feature_extractor, loaded_threshold

# Restore everything that was written to disk
model, preprocessor, feature_extractor, threshold = load_model_components()

# Hand-picked pairs to exercise the restored pipeline
test_pairs = [
    ("Bank of America", "BofA"),
    ("McDonald's", "Mcdonald's Restaurant"),
    ("Walmart", "Target"),  # Non-match
    ("7-Eleven", "7-11"),
    ("Home Depot", "The Home Depot Inc")
]


def _score_pair(original, variant):
    """Run one (original, variant) pair through cleaning, features and the model."""
    clean_original = preprocessor.preprocess(original)
    clean_variant = preprocessor.preprocess(variant)

    feature_values = feature_extractor.get_basic_features(clean_original, clean_variant)
    # Single-row matrix with the columns in training order
    feature_array = np.array([feature_values[f] for f in feature_columns]).reshape(1, -1)

    probability = model.predict_proba(feature_array)[0, 1]
    prediction = 1 if probability >= threshold else 0

    return {
        "original": original,
        "variant": variant,
        "clean_original": clean_original,
        "clean_variant": clean_variant,
        "probability": probability,
        "prediction": prediction,
        "is_match": "Yes" if prediction == 1 else "No"
    }


results = [_score_pair(original, variant) for original, variant in test_pairs]

# Display results
results_df = pd.DataFrame(results)
print("Prediction results for new merchant pairs:")
print(results_df[["original", "variant", "probability", "is_match"]])

In [None]:
#13
# Headline numbers for the trained model
top_feature_idx = np.argmax(xgb_model.feature_importances_)
summary = {
    "Model": "XGBoost Ensemble",
    "Features": ", ".join(feature_columns),
    "Best Threshold": best_threshold,
    "F1 Score": f1_score(y_test, y_pred_optimized),
    "Top Feature": feature_columns[top_feature_idx]
}

print("Model Performance Summary:")
for label, value in summary.items():
    print(f"{label}: {value}")

# Ideas for follow-up work, printed one per line
next_steps = [
    "1. Incorporate advanced semantic features using BERT embeddings",
    "2. Add phonetic matching with Soundex or Double Metaphone",
    "3. Expand the abbreviation dictionary and preprocessing rules",
    "4. Scale with PySpark for larger datasets",
    "5. Implement active learning to improve from user feedback",
    "6. Deploy as a service with FastAPI or Flask",
]
print("\nNext Steps for Further Improvement:")
for step in next_steps:
    print(step)