# Data Deduplication using Clustering
**Objective**: Learn and implement data deduplication techniques.

**Task**: DBSCAN for Data Deduplication

**Steps**:
1. Data Set: Download a dataset containing duplicate entries for event registrations.
2. DBSCAN Clustering: Apply the DBSCAN algorithm to cluster similar registrations.
3. Identify Duplicates: Detect duplicates based on the density of the clusters.
4. Refinement: Validate clusters and remove any erroneous duplicates.

In [None]:
# write your code from here


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
import warnings

# Install a global "ignore" filter: every warning raised after this point is
# suppressed (presumably to keep the notebook output uncluttered).
warnings.filterwarnings("ignore")

# Step 1: Simulate event registration dataset with duplicates
def generate_event_data():
    """Return a small, hard-coded event-registration table for the demo.

    The table has ten rows and three string columns ('Name', 'Email',
    'Phone'). Three registrations (Alice Smith, Charlie Ray, Eva White)
    appear twice so there is something to deduplicate.
    """
    # One tuple per registration, in row order: (name, email, phone).
    registrations = [
        ('Alice Smith', 'alice@example.com', '12345'),
        ('Bob Jones', 'bob@example.com', '23456'),
        ('Charlie Ray', 'charlie@example.com', '34567'),
        ('Alice Smith', 'alice@example.com', '12345'),
        ('David Lee', 'david@example.com', '45678'),
        ('Charlie Ray', 'charlie@example.com', '34567'),
        ('Eva White', 'eva@example.com', '56789'),
        ('Frank West', 'frank@example.com', '67890'),
        ('Eva White', 'eva@example.com', '56789'),
        ('George King', 'george@example.com', '78901'),
    ]
    return pd.DataFrame(registrations, columns=['Name', 'Email', 'Phone'])

# Step 2: Preprocess text using TF-IDF + normalize numerical
def preprocess_data(df):
    """Turn registration records into a dense feature matrix for clustering.

    For each row, 'Name' and 'Email' are joined with a space and vectorized
    with TF-IDF; 'Phone' is cast to float and passed through StandardScaler.
    The two feature sets are stacked column-wise.

    Args:
        df: DataFrame containing 'Name', 'Email' and 'Phone' columns.

    Returns:
        A tuple ``(records, features)`` where ``records`` is a copy of ``df``
        with a fresh 0..n-1 index and ``features`` is a 2-D NumPy array with
        one row per record.

    Raises:
        ValueError: If any step fails (e.g. a missing column or a phone value
            that cannot be cast to float). The original exception is chained
            as ``__cause__``.
    """
    try:
        vectorizer = TfidfVectorizer()
        text_features = vectorizer.fit_transform(df['Name'] + " " + df['Email'])
        numeric_features = StandardScaler().fit_transform(df[['Phone']].astype(float))
        # TF-IDF yields a sparse matrix; densify it so it can be stacked with
        # the scaled phone column.
        combined = np.hstack((text_features.toarray(), numeric_features))
        return df.copy().reset_index(drop=True), combined
    except Exception as e:
        # Chain explicitly so the root cause stays visible in the traceback.
        raise ValueError(f"Preprocessing failed: {e}") from e

# Step 3: Apply DBSCAN
def perform_dbscan(data, eps=0.5, min_samples=2):
    """Cluster the feature matrix with DBSCAN using Euclidean distance.

    Args:
        data: Feature matrix with one row per record.
        eps: Neighbourhood radius passed to DBSCAN.
        min_samples: Minimum neighbourhood size passed to DBSCAN.

    Returns:
        The array of cluster labels produced by ``fit_predict``, one per row
        of ``data``. DBSCAN labels noise points as -1.

    Raises:
        ValueError: If model construction or fitting fails. The original
            exception is chained as ``__cause__``.
    """
    try:
        model = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
        return model.fit_predict(data)
    except Exception as e:
        # Chain explicitly so the root cause stays visible in the traceback.
        raise ValueError(f"DBSCAN clustering failed: {e}") from e

# Step 4: Deduplicate data based on DBSCAN clusters
def deduplicate_by_cluster(df, labels):
    """Collapse each cluster to a single record and keep unclustered records.

    Rows labelled -1 (noise) did not match any other row, so they are unique
    registrations and are all kept. For every other label only the first row
    carrying that label is kept; later rows with the same label are treated
    as duplicates and dropped.

    The input DataFrame is not modified. Rows are returned in their original
    relative order with a fresh 0..n-1 index, and each kept row is returned
    intact (values are never blended across rows of a cluster).

    Args:
        df: Records to deduplicate, one row per label.
        labels: Cluster label for each row of ``df``, positionally aligned;
            -1 marks a row that belongs to no cluster.

    Returns:
        A new DataFrame holding the deduplicated records, without a
        'Cluster' column.

    Raises:
        ValueError: If deduplication fails, e.g. when ``labels`` and ``df``
            differ in length. The original exception is chained as
            ``__cause__``.
    """
    try:
        # np.asarray strips any index from `labels` so they are matched to
        # rows by position; a length mismatch raises here.
        cluster_ids = pd.Series(np.asarray(labels), index=df.index)
        is_unclustered = cluster_ids == -1
        is_first_of_cluster = ~cluster_ids.duplicated(keep='first')
        # Use a plain boolean array so selection is positional even when the
        # DataFrame index contains repeated values.
        keep_mask = (is_unclustered | is_first_of_cluster).to_numpy()
        kept = df[keep_mask]
        # An earlier step may have attached a 'Cluster' column; it is not
        # part of the registration record.
        return kept.drop(columns=['Cluster'], errors='ignore').reset_index(drop=True)
    except Exception as e:
        raise ValueError(f"Deduplication failed: {e}") from e

# Optional: Show cluster assignment for debugging
def show_cluster_info(df, labels):
    """Print the records with their cluster labels, sorted by label.

    The labels are attached to a temporary copy, so the caller's DataFrame
    is left unchanged.

    Args:
        df: Records that were clustered.
        labels: Cluster label for each row of ``df``.
    """
    annotated = df.assign(Cluster=labels)
    print("\nCluster Assignments:\n", annotated.sort_values(by='Cluster'))

# Main workflow
def main():
    """Run the deduplication demo end to end, printing each stage.

    Stages: build the sample registrations, convert them to features,
    cluster with DBSCAN (eps=0.7, min_samples=2), print the cluster
    assignments, then print the deduplicated records.
    """
    print("Generating event registration data...\n")
    registrations = generate_event_data()
    print("Original Data:\n", registrations)

    print("\nPreprocessing data...")
    prepared = preprocess_data(registrations)
    records, features = prepared

    print("\nApplying DBSCAN clustering...")
    cluster_labels = perform_dbscan(features, eps=0.7, min_samples=2)

    print("\nReviewing cluster assignments...")
    show_cluster_info(records, cluster_labels)

    print("\nDeduplicating records...")
    result = deduplicate_by_cluster(records, cluster_labels)

    print("\nFinal Deduplicated Data:\n", result)

# Run
# Run the demo only when executed as a script or notebook cell, not when
# this code is imported as a module.
if __name__ == "__main__":
    main()


Generating event registration data...

Original Data:
           Name                Email  Phone
0  Alice Smith    alice@example.com  12345
1    Bob Jones      bob@example.com  23456
2  Charlie Ray  charlie@example.com  34567
3  Alice Smith    alice@example.com  12345
4    David Lee    david@example.com  45678
5  Charlie Ray  charlie@example.com  34567
6    Eva White      eva@example.com  56789
7   Frank West    frank@example.com  67890
8    Eva White      eva@example.com  56789
9  George King   george@example.com  78901

Preprocessing data...

Applying DBSCAN clustering...

Reviewing cluster assignments...

Cluster Assignments:
           Name                Email  Phone  Cluster
1    Bob Jones      bob@example.com  23456       -1
4    David Lee    david@example.com  45678       -1
7   Frank West    frank@example.com  67890       -1
9  George King   george@example.com  78901       -1
0  Alice Smith    alice@example.com  12345        0
3  Alice Smith    alice@example.com  12345       