In [1]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.




In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
from itertools import combinations
import numpy as np

def create_feature_pairs(df, column1, column2, similarity_function):
    """Score, row by row, how alike the values in two columns are.

    For every row of ``df`` the values under ``column1`` and ``column2`` are
    compared using the first rule that applies:

    * both values are strings -> ``similarity_function(val1, val2)``
    * either value is missing (``pd.isna``) -> ``0.0``
    * anything else -> ``1.0`` when the values are equal, otherwise ``0.0``

    Returns a list holding one score per row, in row order.
    """
    def _score(left, right):
        both_text = isinstance(left, str) and isinstance(right, str)
        if both_text:
            return similarity_function(left, right)
        if pd.isna(left) or pd.isna(right):
            # A missing value on either side counts as "no similarity".
            return 0.0
        # Non-string values only ever match exactly.
        return 1.0 if left == right else 0.0

    return [_score(row[column1], row[column2]) for _, row in df.iterrows()]

def generate_pairs(df):
    """Return every unordered pair of index labels found in ``df``.

    Each label is paired with every label that comes after it in
    ``df.index``, so a frame with ``n`` rows yields ``n * (n - 1) / 2``
    tuples and a frame with fewer than two rows yields an empty list.
    """
    labels = list(df.index)
    return [
        (first, second)
        for position, first in enumerate(labels)
        for second in labels[position + 1:]
    ]

def calculate_pair_features(df, pairs, text_columns, numeric_columns):
    """Build the similarity feature matrix for pairs of records.

    Args:
        df: DataFrame holding the records; looked up with ``df.loc``.
        pairs: Iterable of ``(idx1, idx2)`` index-label tuples.
        text_columns: Columns compared as text (values are passed through
            ``str`` first).
        numeric_columns: Columns compared by absolute difference.

    Returns:
        A 2-D NumPy array with one row per pair.  The column layout is:

        1. one TF-IDF cosine similarity per text column,
        2. then, per text column, ``fuzz.ratio``, ``fuzz.partial_ratio`` and
           ``fuzz.token_sort_ratio`` scaled to 0..1,
        3. then one absolute difference per numeric column (not normalised).

        With no pairs the result has shape
        ``(0, 4 * len(text_columns) + len(numeric_columns))``.
    """
    n_features = 4 * len(text_columns) + len(numeric_columns)
    pair_features = []

    for idx1, idx2 in pairs:
        record1 = df.loc[idx1]
        record2 = df.loc[idx2]

        # Stringify each text value once; both text passes below reuse it.
        texts = [(str(record1[col]), str(record2[col])) for col in text_columns]
        features = []

        # Text similarity using TF-IDF and cosine similarity.
        for text1, text2 in texts:
            try:
                tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
            except ValueError:
                # The vectorizer refuses to fit when neither string yields a
                # token (empty vocabulary), e.g. empty or one-character
                # values.  Without any shared token there is nothing to
                # compare, so score the pair as dissimilar instead of
                # aborting the whole run.
                similarity = 0.0
            else:
                similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
            features.append(similarity)

        # Character-level string similarity using FuzzyWuzzy (0..100 -> 0..1).
        for text1, text2 in texts:
            features.extend([
                fuzz.ratio(text1, text2) / 100.0,
                fuzz.partial_ratio(text1, text2) / 100.0,
                fuzz.token_sort_ratio(text1, text2) / 100.0,
            ])

        # Raw absolute numeric difference; consider normalising by the range
        # of the column if the scales differ a lot.
        for col in numeric_columns:
            features.append(abs(record1[col] - record2[col]))

        pair_features.append(features)

    if not pair_features:
        # Keep the result two-dimensional even when there is nothing to score.
        return np.empty((0, n_features))
    return np.array(pair_features)

def create_pairs_with_labels(df, duplicate_indices, random_state=None):
    """Build labelled record pairs (1 = duplicate, 0 = not) for training.

    Args:
        df: DataFrame whose index labels identify the records.
        duplicate_indices: Either a flat sequence of index labels that are
            all duplicates of one another, or a sequence of such groups
            (each group a ``list``, ``set`` or NumPy array), e.g.
            ``[[0, 3], [1, 4]]``.  Labels that are not in ``df.index`` are
            ignored.
        random_state: Optional seed for the negative sampling.  When
            ``None`` the global ``np.random`` state is used.

    Returns:
        ``(pairs, labels)`` where ``pairs`` is a list of ``(idx1, idx2)``
        tuples - all duplicate pairs first, then the sampled non-duplicate
        pairs - and ``labels`` is a NumPy array aligned with ``pairs``.
    """
    all_indices = list(df.index)
    known = set(all_indices)  # O(1) membership instead of scanning the list

    # Normalise the input to a list of groups.  Index labels can never be
    # lists, sets or arrays (they must be hashable), so this test cannot
    # mistake a label for a group.
    group_types = (list, set, np.ndarray)
    if any(isinstance(item, group_types) for item in duplicate_indices):
        groups = [
            list(item) if isinstance(item, group_types) else [item]
            for item in duplicate_indices
        ]
    else:
        groups = [list(duplicate_indices)]

    # Every pair inside a group is a known duplicate.
    pairs = []
    seen = set()  # unordered duplicate pairs, for exclusion below
    for group in groups:
        members = [idx for idx in group if idx in known]
        for idx1, idx2 in combinations(members, 2):
            key = frozenset((idx1, idx2))
            if idx1 == idx2 or key in seen:
                continue
            seen.add(key)
            pairs.append((idx1, idx2))
    n_positive = len(pairs)

    # Sample negatives from the pairs that are NOT known duplicates.
    # Sampling whole pairs (rather than 2 * k distinct indices) lets an
    # index take part in several pairs and can never request more samples
    # than exist: the target of twice as many negatives as positives is
    # capped at the number of candidates available.
    candidates = [
        (idx1, idx2)
        for idx1, idx2 in combinations(all_indices, 2)
        if idx1 != idx2 and frozenset((idx1, idx2)) not in seen
    ]
    n_negative = min(n_positive * 2, len(candidates))
    if n_negative:
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        chosen = rng.choice(len(candidates), size=n_negative, replace=False)
        pairs.extend(candidates[position] for position in chosen)

    labels = np.array([1] * n_positive + [0] * n_negative)
    return pairs, labels

def train_deduplication_model(df, duplicate_indices, text_columns, numeric_columns):
    """Fit a random-forest classifier that tells duplicate pairs from distinct ones.

    Labelled pairs come from ``create_pairs_with_labels`` and are converted
    to feature vectors by ``calculate_pair_features``.  The data is split
    80/20 (stratified on the label, fixed seed), the forest is fitted on the
    larger part, and a classification report for the held-out part is
    printed.

    Returns the fitted ``RandomForestClassifier``.
    """
    labelled_pairs, targets = create_pairs_with_labels(df.copy(), duplicate_indices)
    feature_matrix = calculate_pair_features(
        df.copy(), labelled_pairs, text_columns, numeric_columns
    )

    split = train_test_split(
        feature_matrix,
        targets,
        test_size=0.2,
        random_state=42,
        stratify=targets,
    )
    features_fit, features_eval, targets_fit, targets_eval = split

    classifier = RandomForestClassifier(random_state=42)
    classifier.fit(features_fit, targets_fit)

    # Evaluate on the held-out split before handing the model back.
    predicted = classifier.predict(features_eval)
    print("Classification Report (Deduplication Model):")
    print(classification_report(targets_eval, predicted))

    return classifier

def predict_duplicates(df, model, text_columns, numeric_columns, threshold=0.7):
    """Find record pairs in ``df`` that the trained model considers duplicates.

    Args:
        df: DataFrame of records to check.
        model: Fitted classifier exposing ``predict_proba``; the second
            probability column is taken as the probability of "duplicate".
        text_columns: Text columns used to build the pair features.
        numeric_columns: Numeric columns used to build the pair features.
        threshold: Minimum duplicate probability for a pair to be reported.

    Returns:
        A list of ``(idx1, idx2, probability)`` tuples, one per pair whose
        probability is at least ``threshold``.  Empty when ``df`` has fewer
        than two rows.
    """
    # With fewer than two records there is no pair to score, and an empty
    # feature array is not valid input for the classifier.
    if len(df) < 2:
        return []

    pairs = generate_pairs(df)
    pair_features = calculate_pair_features(df, pairs, text_columns, numeric_columns)
    probabilities = model.predict_proba(pair_features)[:, 1]  # P(duplicate)

    return [
        (idx1, idx2, probability)
        for (idx1, idx2), probability in zip(pairs, probabilities)
        if probability >= threshold
    ]

if __name__ == '__main__':
    def _show_matches(frame, matches):
        """Print both records of every reported pair plus its probability."""
        for idx1, idx2, prob in matches:
            print(f"Record {idx1}: {frame.loc[idx1].to_dict()}")
            print(f"Record {idx2}: {frame.loc[idx2].to_dict()} (Probability: {prob:.2f})")
            print("-" * 20)

    # --- Training data: a small sample with known duplicates ---
    data = {
        'name': ['Alice Smith', 'Bob Johnson', 'Charlie Brown', 'Alice Smith', 'Bob Jonson', 'David White'],
        'email': ['alice.smith@example.com', 'bob.johnson@example.com', 'charlie.brown@example.org', 'a.smith@example.com', 'bob.j@example.com', 'david.white@sample.net'],
        'age': [30, 45, 22, 30, 46, 50],
        'city': ['New York', 'Los Angeles', 'New York', 'NY', 'LA', 'Chicago'],
    }
    df = pd.DataFrame(data)

    # Records 0/3 are the Alice Smith variations, 1/4 the Bob Johnson ones.
    # NOTE: the list is passed flat, and create_pairs_with_labels pairs up
    # every two entries of the list it is given.
    duplicate_indices = [0, 3, 1, 4]

    text_cols = ['name', 'email', 'city']
    numeric_cols = ['age']

    # Fit the model, then score the very same records it was trained on.
    deduplication_model = train_deduplication_model(df.copy(), duplicate_indices, text_cols, numeric_cols)
    potential_duplicates = predict_duplicates(df.copy(), deduplication_model, text_cols, numeric_cols, threshold=0.8)

    print("\nPotential Duplicate Pairs (with probability >= 0.8):")
    _show_matches(df, potential_duplicates)

    # --- Applying the trained model to a new, larger dataset (example) ---
    larger_data = {
        'name': ['Alice Smith', 'Bob Johnson', 'Charlie Brown', 'Alice Smth', 'Robert Johnsen', 'David White', 'Charlie Braun'],
        'email': ['alice.smith@example.com', 'bob.johnson@example.com', 'charlie.brown@example.org', 'asmith@example.com', 'r.johnsen@example.com', 'david.white@sample.net', 'c.brown@example.org'],
        'age': [30, 45, 22, 31, 44, 50, 23],
        'city': ['New York', 'Los Angeles', 'New York', 'New York', 'LA', 'Chicago', 'NYC'],
    }
    larger_df = pd.DataFrame(larger_data)

    predicted_duplicates_larger = predict_duplicates(larger_df.copy(), deduplication_model, text_cols, numeric_cols, threshold=0.7)

    print("\nPotential Duplicate Pairs in Larger Dataset (with probability >= 0.7):")
    _show_matches(larger_df, predicted_duplicates_larger)

ModuleNotFoundError: No module named 'fuzzywuzzy'