In [None]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication that classifies record pairs as duplicates using feature-similarity scores.




In [3]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Toy customer records used throughout the notebook (one tuple per record).
df = pd.DataFrame(
    [
        (1, 'John Smith', 'john@example.com'),
        (2, 'Jon Smith', 'jsmith@example.com'),
        (3, 'Jane Doe', 'jane.d@example.com'),
        (4, 'J. Smith', 'john.smith@gmail.com'),
    ],
    columns=['id', 'name', 'email'],
)

# Hand-labelled record pairs: (left position, right position, label).
# label is 1 when the two records are duplicates and 0 otherwise.
# In practice these would come from historical merges or a labelling effort.
labeled_pairs = [
    (0, 1, 1),  # John Smith vs Jon Smith = duplicate
    (0, 2, 0),  # John Smith vs Jane Doe = not duplicate
    (1, 2, 0),  # Jon Smith vs Jane Doe = not duplicate
    (0, 3, 1),  # John Smith vs J. Smith = duplicate
]

# Expand every labelled pair into one flat record holding both sides' fields,
# then assemble the training frame in a single DataFrame construction.
train_data = [
    {
        'name1': df.iloc[left]['name'],
        'name2': df.iloc[right]['name'],
        'email1': df.iloc[left]['email'],
        'email2': df.iloc[right]['email'],
        'label': is_dup,
    }
    for left, right, is_dup in labeled_pairs
]
train_df = pd.DataFrame(train_data)

# Feature engineering: string similarity
def string_similarity(a, b):
    """Return the TF-IDF cosine similarity between two strings.

    A fresh ``TfidfVectorizer`` with default settings is fitted on just the
    two inputs, and the cosine similarity of their two TF-IDF vectors is
    returned.

    Parameters
    ----------
    a, b : str
        The two strings to compare.

    Returns
    -------
    float
        Cosine similarity of the two TF-IDF vectors. ``0.0`` is returned when
        neither string produces any token (for example two empty strings),
        since there is then nothing to compare.
    """
    vectorizer = TfidfVectorizer()
    tokenize = vectorizer.build_analyzer()

    # Fitting on documents that yield no tokens at all raises
    # "ValueError: empty vocabulary"; treat that case as "no evidence of
    # similarity" instead of crashing the whole feature computation.
    if not tokenize(a) and not tokenize(b):
        return 0.0

    # fit_transform is the one-pass equivalent of fit(...) then transform(...).
    vecs = vectorizer.fit_transform([a, b])
    return float(cosine_similarity(vecs[0], vecs[1])[0][0])

def compute_features(row):
    """Build the similarity features for one candidate record pair.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``name1``, ``name2``, ``email1`` and ``email2``.

    Returns
    -------
    pd.Series
        ``name_sim`` and ``email_sim`` (in that order), each computed with
        ``string_similarity`` on the corresponding pair of fields.
    """
    # feature name -> (left field, right field) compared to produce it
    field_pairs = {
        'name_sim': ('name1', 'name2'),
        'email_sim': ('email1', 'email2'),
    }
    return pd.Series({
        feature: string_similarity(row[left], row[right])
        for feature, (left, right) in field_pairs.items()
    })

# Feature matrix: one row of similarity features per labelled pair.
features = train_df.apply(compute_features, axis=1)
X = features
y = train_df['label']

# Train/test split
# With so few labelled pairs an unstratified split can put a single class in
# the training set; the forest would then know only one class and the
# duplicate-probability lookup at prediction time would fail. Stratify whenever
# every class has at least two examples (the minimum stratification needs);
# otherwise fall back to the plain random split.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=stratify_labels
)

# Train a model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out pairs. With a labelled set this small the report is
# only a smoke test, not a meaningful estimate of accuracy.
y_pred = model.predict(X_test)
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# ---------------------
# ✅ Predict on all possible pairs
# ---------------------
# Pairs are enumerated by *position* because rows are fetched with .iloc below;
# iterating df.index would only be correct for a default 0..n-1 index.
all_pairs = list(combinations(range(len(df)), 2))
pair_rows = [(df.iloc[i], df.iloc[j]) for i, j in all_pairs]

# One feature row per candidate pair. Dict keys are matched to X.columns by
# name, so the frame carries the training feature columns in training order.
features_df = pd.DataFrame(
    [
        {
            'name_sim': string_similarity(left['name'], right['name']),
            'email_sim': string_similarity(left['email'], right['email']),
        }
        for left, right in pair_rows
    ],
    columns=X.columns,
)

if all_pairs:
    # Score every pair in one batch instead of invoking the forest per pair.
    preds = model.predict(features_df)
    probas = model.predict_proba(features_df)
    # predict_proba columns follow model.classes_; look up the "duplicate"
    # (label 1) column explicitly rather than assuming it is column 1.
    classes = list(model.classes_)
    if 1 in classes:
        dup_probs = probas[:, classes.index(1)]
    else:
        # The model was never shown a duplicate, so it assigns that class no mass.
        dup_probs = np.zeros(len(all_pairs))
else:
    # Fewer than two records: nothing to compare, nothing to predict.
    preds, dup_probs = [], []

pair_predictions = [
    {
        'id1': left['id'],
        'id2': right['id'],
        'name1': left['name'],
        'name2': right['name'],
        'email1': left['email'],
        'email2': right['email'],
        'is_duplicate': pred,
        'confidence': round(prob, 2),
    }
    for (left, right), pred, prob in zip(pair_rows, preds, dup_probs)
]

# Explicit columns keep the frame well-formed (and the filter below valid)
# even when there are no pairs at all.
result_df = pd.DataFrame(
    pair_predictions,
    columns=['id1', 'id2', 'name1', 'name2', 'email1', 'email2',
             'is_duplicate', 'confidence'],
)
print("\n=== Predicted Duplicate Pairs ===")
print(result_df[result_df['is_duplicate'] == 1])



=== Classification Report ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2


=== Predicted Duplicate Pairs ===
   id1  id2       name1      name2              email1                email2  \
0    1    2  John Smith  Jon Smith    john@example.com    jsmith@example.com   
2    1    4  John Smith   J. Smith    john@example.com  john.smith@gmail.com   
4    2    4   Jon Smith   J. Smith  jsmith@example.com  john.smith@gmail.com   

   is_duplicate  confidence  
0             1        0.78  
2             1        0.78  
4             1        0.78  
