In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches
import scipy.spatial as spatial
from random import randint
import random

In [None]:
# Select the data set to cluster. 'iris' loads the scikit-learn iris sample;
# any other value (including the current 'yolo') falls through to the DBSCAN
# demo CSV fetched from GitHub.
data_set = 'yolo'

if data_set == 'iris':
    # Imported inside the branch, so it is only executed when 'iris' is selected.
    from sklearn import datasets
    iris = datasets.load_iris()
    # Keep only the first two feature columns as the (x, y) coordinates.
    x = iris.data[:, :2]
    # Plain constructor; for a column-oriented dict this yields the same frame
    # as the DataFrame.from_dict classmethod.
    df = pd.DataFrame({'x': x[:, 0], 'y': x[:, 1]})
    # DBSCAN parameters for this data set
    eps = 0.2
    min_pts = 8
else:
    df = pd.read_csv('https://raw.githubusercontent.com/lnxdxC/DSAI/main/L03_Clustering/DBSCAN_data.csv')
    # DBSCAN parameters for this data set
    eps = 2
    min_pts = 3

# Search tree: k-d tree built over the (x, y) coordinates of every point.
# NOTE(review): presumably used for the eps-radius neighbour queries of the
# DBSCAN implementation; the cells that use it are not shown here.
search_tree = spatial.cKDTree(np.c_[df.x, df.y])

In [None]:
# Add the two per-point columns that hold cluster information:
# 'id' starts out as NaN and 'color' starts out as black ('#000000').
# The targets are assigned left to right, so 'id' is created before 'color'.
df['id'], df['color'] = np.nan, '#000000'

In [None]:
# Scatter plot of the raw points (semi-transparent markers of size 50).
# The trailing semicolon suppresses the Axes repr in the cell output.
df.plot(kind='scatter', x='x', y='y', s=50, alpha=0.5);

# DBSCAN
Density-Based Spatial Clustering of Applications with Noise (DBSCAN) has two core parameters that must be chosen before clustering can start:
1. $\epsilon$
2. ```min_pts```

Here $\epsilon$ is the radius of the circle (the neighbourhood) drawn around each point of the dataset, and ```min_pts``` is the threshold on the number of points inside that circle that the clustering process uses to classify each point as a core point, a border point, or a noise point.

# sklearn implementation

## Large scale DBSCAN

In [None]:
# Load the t48k data set. Note that this rebinds `df`, replacing the frame
# loaded earlier in the notebook.
df = pd.read_csv('https://raw.githubusercontent.com/lnxdxC/DSAI/main/L03_Clustering/t48k.csv')
# Feature matrix for the clustering below: one row per point, columns (x, y).
X = np.array([df.x, df.y]).T

In [None]:
# Imported here because no earlier cell brings DBSCAN into scope; without it a
# fresh "Restart & Run All" stops at this cell with a NameError.
from sklearn.cluster import DBSCAN

# Cluster the (x, y) points with scikit-learn's DBSCAN.
db = DBSCAN(eps=8, min_samples=15).fit(X)
labels = db.labels_

# Boolean mask over all points: True at the indices reported as core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

In [None]:
# One colour per label, taken from the Spectral colormap; the noise label (-1)
# is drawn in black instead of its colormap colour.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]

fig, ax = plt.subplots(figsize=(10, 5))

for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = labels == k

    # Core samples are drawn first with large markers, then the remaining
    # members of the same label with small markers.
    for subset_mask, marker_size in ((core_samples_mask, 14), (~core_samples_mask, 6)):
        xy = X[class_member_mask & subset_mask]
        ax.plot(xy[:, 0], xy[:, 1], "o", markerfacecolor=tuple(col), markeredgecolor="k", markersize=marker_size)

# Title and axis labels so the figure can be read on its own.
ax.set(title="DBSCAN clusters (noise in black)", xlabel="x", ylabel="y")
plt.show()