# Data Deduplication using Clustering
**Objective**: Learn and implement data deduplication techniques.

**Task**: Hierarchical Clustering for Deduplication

**Steps**:
1. Data Set: Obtain a dataset containing duplicate employee information.
2. Perform Clustering: Use hierarchical agglomerative clustering to cluster the employee
records.
3. Evaluate Duplicates: Determine duplicates by analyzing the clusters formed.
4. Clean Data: Remove duplicate employee records found during clustering.

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from scipy.cluster.hierarchy import dendrogram
import matplotlib.pyplot as plt

# Step 1: Generate a dataset containing duplicate employee information (replace with your actual data)
np.random.seed(42)
num_records = 80
data = {
    'EmployeeID': np.arange(1, num_records + 1),
    'Name': [f'Employee {i}' for i in range(1, num_records + 1)],
    'Age': np.random.randint(22, 55, num_records),
    'Salary': np.random.randint(40000, 120000, num_records),
    'Department': np.random.choice(['HR', 'Engineering', 'Sales', 'Marketing'], num_records)
}
df = pd.DataFrame(data)

# Introduce some duplicates with slight variations
df_duplicates = pd.DataFrame({
    'EmployeeID': [81, 82, 83, 84],
    'Name': ['Employee 12', 'Employe 25', 'Employee 48', 'Employee 61'], # Slight typos
    'Age': [30, 41, 28, 49],
    'Salary': [75000, 92000, 68000, 110000],
    'Department': ['HR', 'Engineering', 'Sales', 'Marketing']
})

df = pd.concat([df, df_duplicates], ignore_index=True)
np.random.shuffle(df.values) # Shuffle the order

print("Original DataFrame with Potential Duplicates:")
print(df.head())

# Step 2: Perform Clustering: Use hierarchical agglomerative clustering to cluster the employee records.
# Select numerical features for clustering
numerical_features = ['Age', 'Salary']
X = df[numerical_features].copy()

# Standardize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=numerical_features)

# Perform hierarchical clustering
# Choose the number of clusters or a distance threshold
n_clusters = 10 # You might need to determine this based on a dendrogram or domain knowledge
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward') # 'ward' minimizes variance within clusters
df['Cluster'] = agg_clustering.fit_predict(X_scaled_df)

print("\nDataFrame with Cluster Assignments:")
print(df.head())

# Optional: Visualize the dendrogram to help determine the number of clusters
# linked = linkage(X_scaled, 'ward')
# plt.figure(figsize=(12, 6))
# dendrogram(linked, orientation='top')
# plt.title('Hierarchical Clustering Dendrogram')
# plt.xlabel('Employee Records')
# plt.ylabel('Distance')
# plt.show()

# Step 3: Evaluate Duplicates: Determine duplicates by analyzing the clusters formed.
def identify_duplicates_hierarchical(cluster_df):
    if len(cluster_df) <= 1:
        return cluster_df
    # Calculate pairwise distances based on the scaled numerical features
    numerical_cols = ['Age', 'Salary']
    distances = pairwise_distances(cluster_df[numerical_cols], metric='euclidean')
    # Set a threshold for considering records as duplicates (you might need to tune this)
    similarity_threshold = 1.5
    duplicates_to_drop = set()
    for i in range(len(cluster_df)):
        for j in range(i + 1, len(cluster_df)):
            if distances[i, j] < similarity_threshold:
                # Keep the record with the lower EmployeeID as the representative
                index_to_drop = cluster_df.iloc[j].name
                duplicates_to_drop.add(index_to_drop)
    return cluster_df.drop(index=duplicates_to_drop)

# Group by cluster and apply the duplicate identification function
df_potential_duplicates = df.groupby('Cluster', group_keys=False).apply(identify_duplicates_hierarchical).reset_index(drop=True)

print("\nPotential Duplicates Identified within Clusters:")
print(df_potential_duplicates.head())

# Step 4: Clean Data: Remove duplicate employee records found during clustering.
# We can consider the records in df_potential_duplicates as the unique records
df_deduplicated = df.loc[df_potential_duplicates.index].reset_index(drop=True)

print("\nDeduplicated DataFrame:")
print(df_deduplicated.head())
print(f"\nNumber of original records: {len(df)}")
print(f"Number of deduplicated records: {len(df_deduplicated.drop_duplicates(subset=['Name', 'Age', 'Salary', 'Department']))}")

Original DataFrame with Potential Duplicates:
   EmployeeID        Name  Age  Salary Department
0           1  Employee 1   50   70535  Marketing
1           2  Employee 2   36  118603      Sales
2           3  Employee 3   29   92256      Sales
3           4  Employee 4   42   75222  Marketing
4           5  Employee 5   40  117373      Sales

DataFrame with Cluster Assignments:
   EmployeeID        Name  Age  Salary Department  Cluster
0           1  Employee 1   50   70535  Marketing        8
1           2  Employee 2   36  118603      Sales        7
2           3  Employee 3   29   92256      Sales        3
3           4  Employee 4   42   75222  Marketing        9
4           5  Employee 5   40  117373      Sales        7

Potential Duplicates Identified within Clusters:
   EmployeeID        Name  Age  Salary Department  Cluster
0           1  Employee 1   50   70535  Marketing        8
1           2  Employee 2   36  118603      Sales        7
2           3  Employee 3   29   922