# Cluster based anonymization
The goal of this notebook is to demonstrate what cluster-based anonymization looks like.

In [None]:
import numpy as np # Linear algebra
import pandas as pd # Data processing, CSV file
import geopandas as gpd # Geocharting
import matplotlib.pyplot as plt # Plotting
from sklearn.cluster import KMeans # Clustering

# Configuration

# Path to the volcanic eruptions CSV.
# Download at https://www.kaggle.com/smithsonian/volcanic-eruptions
DATASET_LOCATION: str = "/kaggle/input/volcanic-eruptions/database.csv"

# Divisor applied to the row count to derive the number of clusters.
DATAPOINTS_PER_CLUSTER: int = 300

## Load the chart of the world

In [None]:
# Load the Natural Earth low-resolution country polygons used as the base map.
# `geopandas.datasets` was deprecated in GeoPandas 0.14 and removed in 1.0,
# where `get_path` raises AttributeError. Try the bundled copy first (older
# GeoPandas) and fall back to the public Natural Earth 110m countries archive
# (needs internet access) when it is no longer available.
NATURAL_EARTH_URL = 'https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip'

try:
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
except AttributeError:
    # NOTE: attribute column names of the upstream file may differ from the
    # bundled copy; the cells shown here only plot the geometry.
    world = gpd.read_file(NATURAL_EARTH_URL)

fig, ax = plt.subplots(1, 1, figsize=(10,10))
world.plot(ax=ax, color='gray')

## Load the dataset & show data location

In [None]:
# Parse the CSV with pandas so Longitude/Latitude (and the other numeric
# columns) get numeric dtypes. `gpd.read_file` on a plain CSV yields string
# columns and, depending on the I/O engine, may return a plain DataFrame
# instead of a GeoDataFrame, which would break the map plots.
volcanoes_raw = pd.read_csv(DATASET_LOCATION)

# Build the GeoDataFrame explicitly: one point per row from the lon/lat columns.
# The coordinates are assumed to be WGS84 degrees (EPSG:4326), matching the
# Natural Earth base map.
df = gpd.GeoDataFrame(
    volcanoes_raw,
    geometry=gpd.points_from_xy(volcanoes_raw['Longitude'], volcanoes_raw['Latitude']),
    crs='EPSG:4326',
)
df.tail()

In [None]:
# Draw every record of the dataset on top of the world map.
fig, ax = plt.subplots(figsize=(10, 10))

world.plot(color='gray', ax=ax)  # base layer: country polygons
df.plot(alpha=0.5, c='b', ax=ax)  # semi-transparent blue data points

In [None]:
# Choose the number of clusters so that a cluster holds DATAPOINTS_PER_CLUSTER
# rows on average. Clamp to at least 1: the integer division yields 0 for
# datasets smaller than DATAPOINTS_PER_CLUSTER, and KMeans rejects n_clusters=0.
cluster_count = max(1, df.shape[0] // DATAPOINTS_PER_CLUSTER)

print(f'Cluster counter: {cluster_count}')

# Cast explicitly so clustering also works when the coordinate columns were
# loaded as strings.
coordinates = df[['Longitude', 'Latitude']].astype(float)

# k-means initialisation is random and the default `n_init` differs between
# scikit-learn versions; pin both so a fresh "Restart & Run All" reproduces the
# same clusters.
# NOTE: KMeans uses Euclidean distance on raw lon/lat degrees, so distances near
# the poles and across the antimeridian are distorted.
kmeans = KMeans(n_clusters=cluster_count, n_init=10, random_state=42)
labels = kmeans.fit_predict(coordinates)

# One (longitude, latitude) row per cluster, in the column order passed to fit.
centroids = kmeans.cluster_centers_
print('centroid locations:\n', centroids)

# Attach the cluster label of every row to the dataframe.
df['Location Cluster'] = labels

## Show clusters
Each red dot marks the location of a cluster centroid.
The coloured dots around it are the data points that belong to that cluster.

In [None]:
# Base layer: world map in gray.
fig, ax = plt.subplots(figsize=(10, 10))
world.plot(color='gray', ax=ax)

# Data points, coloured by the cluster label assigned to each row.
df.plot(column='Location Cluster', cmap='cool', alpha=1, ax=ax)

# Centroids on top: column 0 is longitude (x), column 1 is latitude (y).
centroid_lon = centroids[:, 0]
centroid_lat = centroids[:, 1]
plt.scatter(centroid_lon, centroid_lat, s=75, c='red')
plt.show()

## Next steps
- Fine-tune the number of data points per cluster without losing too much correlation data.
- Remove the location data from the dataframe and export the newly created dataset.