# Data Deduplication using Clustering
**Objective**: Learn and implement data deduplication techniques.

**Task**: Deduplication Using K-means Clustering

**Steps**:
1. Data Set: Download a dataset containing duplicate customer records.
2. Preprocess: Standardize the data to ensure better clustering.
3. Apply K-means: Use K-means clustering to find and group similar customer records.
4. Identify Duplicates: Identify and remove duplicates within clusters.

In [None]:
# write your code from here


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# Simulate sample customer dataset with duplicates
def generate_customer_data():
    return pd.DataFrame({
        'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'David', 'Charlie', 'Eve', 'Frank', 'Eve'],
        'Email': [
            'alice@mail.com', 'bob@mail.com', 'charlie@mail.com', 'alice@mail.com',
            'bob@mail.com', 'david@mail.com', 'charlie@mail.com', 'eve@mail.com',
            'frank@mail.com', 'eve@mail.com'
        ],
        'Phone': ['123', '456', '789', '123', '456', '321', '789', '654', '987', '654']
    })

# Preprocess and standardize data
def preprocess_data(df):
    try:
        if 'Phone' not in df.columns:
            raise KeyError("Missing required column: 'Phone'")
        df['Phone'] = pd.to_numeric(df['Phone'], errors='coerce')
        if df['Phone'].isnull().any():
            raise ValueError("Non-numeric or missing phone values detected.")
        scaled = StandardScaler().fit_transform(df[['Phone']])
        return scaled
    except Exception as e:
        print(f"Data preprocessing error: {e}")
        return None

# K-means clustering
def apply_kmeans(data, n_clusters=5):
    try:
        if data is None or len(data) == 0:
            raise ValueError("Empty or invalid input data for clustering.")
        model = KMeans(n_clusters=n_clusters, random_state=42)
        labels = model.fit_predict(data)
        return labels, model
    except Exception as e:
        print(f"Clustering error: {e}")
        return None, None

# Deduplication based on clustering
def deduplicate_records(df, data, labels):
    try:
        if labels is None:
            raise ValueError("Missing cluster labels for deduplication.")
        df['Cluster'] = labels
        deduplicated = []
        for cluster_id in np.unique(labels):
            cluster_indices = df[df['Cluster'] == cluster_id].index
            cluster_data = data[cluster_indices]
            closest_idx, _ = pairwise_distances_argmin_min(
                cluster_data,
                [np.mean(cluster_data, axis=0)]
            )
            deduplicated.append(df.loc[cluster_indices[closest_idx[0]]])
        return pd.DataFrame(deduplicated).drop(columns='Cluster')
    except Exception as e:
        print(f"Deduplication error: {e}")
        return df

# Main workflow with batch/step handling
def main():
    print("Loading data...")
    df = generate_customer_data()
    print("Original Data:\n", df)

    print("\nPreprocessing data...")
    scaled_data = preprocess_data(df)
    
    print("\nClustering...")
    labels, model = apply_kmeans(scaled_data, n_clusters=5)

    print("\nRemoving duplicates...")
    dedup_df = deduplicate_records(df, scaled_data, labels)

    print("\nDeduplicated Data:\n", dedup_df)

# Execute
main()


Loading data...
Original Data:
       Name             Email Phone
0    Alice    alice@mail.com   123
1      Bob      bob@mail.com   456
2  Charlie  charlie@mail.com   789
3    Alice    alice@mail.com   123
4      Bob      bob@mail.com   456
5    David    david@mail.com   321
6  Charlie  charlie@mail.com   789
7      Eve      eve@mail.com   654
8    Frank    frank@mail.com   987
9      Eve      eve@mail.com   654

Preprocessing data...

Clustering...

Removing duplicates...

Deduplicated Data:
       Name             Email  Phone
0    Alice    alice@mail.com    123
7      Eve      eve@mail.com    654
2  Charlie  charlie@mail.com    789
1      Bob      bob@mail.com    456
8    Frank    frank@mail.com    987
