# Data Deduplication using Clustering
**Objective**: Learn and implement data deduplication techniques.

**Task**: Deduplication Using K-means Clustering

**Steps**:
1. Data Set: Download a dataset containing duplicate customer records.
2. Preprocess: Standardize the data to ensure better clustering.
3. Apply K-means: Use K-means clustering to find and group similar customer records.
4. Identify Duplicates: Identify and remove duplicates within clusters.

In [None]:
# write your code from here


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import random

# Step 1: Simulate sample customer dataset with duplicates
def generate_customer_data():
    data = {
        'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'David', 'Charlie', 'Eve', 'Frank', 'Eve'],
        'Email': [
            'alice@mail.com', 'bob@mail.com', 'charlie@mail.com', 'alice@mail.com',
            'bob@mail.com', 'david@mail.com', 'charlie@mail.com', 'eve@mail.com',
            'frank@mail.com', 'eve@mail.com'
        ],
        'Phone': [
            '123', '456', '789', '123', '456', '321', '789', '654', '987', '654'
        ]
    }
    return pd.DataFrame(data)

# Step 2: Preprocess and standardize the data
def preprocess_data(df):
    df_numeric = df[['Phone']].astype(float)
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df_numeric)
    return scaled_data, scaler

# Step 3: Apply K-means clustering
def cluster_data(scaled_data, k=5):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_data)
    return kmeans.labels_, kmeans

# Step 4: Identify duplicates within each cluster
def deduplicate(df, labels, scaled_data):
    df['Cluster'] = labels
    unique_records = []

    for cluster in np.unique(labels):
        cluster_indices = df[df['Cluster'] == cluster].index
        cluster_data = scaled_data[cluster_indices]
        closest, _ = pairwise_distances_argmin_min(
            cluster_data,
            [np.mean(cluster_data, axis=0)]
        )
        unique_index = cluster_indices[closest[0]]
        unique_records.append(df.loc[unique_index])

    dedup_df = pd.DataFrame(unique_records).drop(columns='Cluster')
    return dedup_df

# -------- Main Function --------
def main():
    df = generate_customer_data()
    print("Original Data:\n", df)

    scaled_data, scaler = preprocess_data(df)
    labels, kmeans = cluster_data(scaled_data, k=5)
    deduplicated_df = deduplicate(df, labels, scaled_data)

    print("\nDeduplicated Data:\n", deduplicated_df)

# Run the full process
main()


Original Data:
       Name             Email Phone
0    Alice    alice@mail.com   123
1      Bob      bob@mail.com   456
2  Charlie  charlie@mail.com   789
3    Alice    alice@mail.com   123
4      Bob      bob@mail.com   456
5    David    david@mail.com   321
6  Charlie  charlie@mail.com   789
7      Eve      eve@mail.com   654
8    Frank    frank@mail.com   987
9      Eve      eve@mail.com   654

Deduplicated Data:
       Name             Email Phone
0    Alice    alice@mail.com   123
7      Eve      eve@mail.com   654
2  Charlie  charlie@mail.com   789
1      Bob      bob@mail.com   456
8    Frank    frank@mail.com   987
