## DBSCAN from scratch

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors

# Apply seaborn's default visual theme to the plots drawn below
sns.set_theme()

## Create the dataset

In [None]:
# Three Gaussian blobs around fixed centers (seeded for reproducibility)
blob_centers = [[-4, -4], [4, 4], [-4, 4]]
X, y = make_blobs(
    n_samples=1000,
    n_features=2,
    centers=blob_centers,
    random_state=4,
)

# Append two hand-placed outliers and label them -1
outlier_coords = np.array([[4, -4], [6, -4]])
outlier_labels = np.array([-1, -1])
X = np.vstack([X, outlier_coords])
y = np.concatenate([y, outlier_labels])

# Tabular view of the coordinates, used for plotting
X_df = pd.DataFrame(data=X, columns=['dim1', 'dim2'])

# Scatter plot of the data, coloured by the generating label
sns.scatterplot(data=X_df, x='dim1', y='dim2', hue=y)

### Radius neighbors

In [None]:
# Neighbor-search index over the whole data set; fit() returns the estimator
nearest_neigh = NearestNeighbors().fit(X)

In [None]:
# Pick one sample and show it as a (1, n_features) array,
# i.e. the 2-D shape that the neighbor queries below are given
example_data_point_id = 0
X[example_data_point_id].reshape(1, -1)

In [None]:
# Distances and indices of every point within radius 0.2 of the example point
query_point = X[example_data_point_id].reshape(1, -1)
nearest_neigh.radius_neighbors(query_point, radius=0.2, return_distance=True)

In [None]:
# Same query for the first two samples at once, returning indices only
first_two_points = X[:2]
nearest_neigh.radius_neighbors(first_two_points, radius=0.2, return_distance=False)

### DBSCAN

In [None]:
# Set main parameters
# eps: radius passed to the radius_neighbors queries below.
# min_samples: minimum number of points returned by such a query for the
# queried point to be treated as a core point.
eps = 1
min_samples = 5

In [None]:
# Per-point bookkeeping table, aligned with X_df by index:
#   cluster   - assigned cluster id (None until assigned)
#   unvisited - True until the point has been processed
#   core      - True once the point is found to be a core point
#   border    - True once the point is found to be a border point
point_df = pd.DataFrame(index=X_df.index).assign(
    cluster=None,
    unvisited=True,
    core=False,
    border=False,
)

point_df.head()

In [None]:
# Cluster ids are handed out sequentially, starting at 0
current_cluster = 0

# Every pass of the outer loop takes one unvisited point and either
# grows a complete cluster from it or just marks it as visited
while True:

    # Stop once no unvisited point is left
    if not point_df['unvisited'].any():
        break

    # Seed: the first point, in index order, that is still unvisited
    curr_point = point_df.index[point_df['unvisited'].to_numpy()][0]

    # Indices of all points within eps of the seed. This set is also
    # the pool of candidates waiting to be added to the cluster.
    seed_neighbors = nearest_neigh.radius_neighbors(
        X[curr_point].reshape(1, -1),
        radius=eps,
        return_distance=False,
    )[0]
    cluster_points = set(seed_neighbors)

    # Neighborhood too small to start a cluster from this point:
    # mark it as visited and move on to the next seed
    if len(cluster_points) < min_samples:
        point_df.loc[curr_point, 'unvisited'] = False
        continue

    # The seed is a core point and opens a new cluster
    point_df.loc[curr_point, 'cluster'] = current_cluster
    point_df.loc[curr_point, 'core'] = True

    # Only points that have not been visited yet remain candidates
    # (the seed is still flagged unvisited at this point)
    cluster_points = {
        candidate
        for candidate in cluster_points
        if point_df.at[candidate, 'unvisited']
    }

    # Grow the cluster until the candidate pool is exhausted
    while True:

        # Take any candidate out of the pool and claim it for the cluster
        curr_point = cluster_points.pop()
        point_df.loc[curr_point, 'unvisited'] = False
        point_df.loc[curr_point, 'cluster'] = current_cluster

        # Indices of all points within eps of the candidate
        candidate_neighbors = set(
            nearest_neigh.radius_neighbors(
                X[curr_point].reshape(1, -1),
                radius=eps,
                return_distance=False,
            )[0]
        )

        if len(candidate_neighbors) >= min_samples:
            # Dense enough: a core point, so its unvisited neighbors
            # become candidates for the current cluster as well
            point_df.loc[curr_point, 'core'] = True
            fresh_candidates = {
                neighbor
                for neighbor in candidate_neighbors
                if point_df.at[neighbor, 'unvisited']
            }
            cluster_points.update(fresh_candidates)
        else:
            # Too few neighbors: a border point, the cluster does not
            # expand through it
            point_df.loc[curr_point, 'border'] = True

        # Pool is empty: the cluster is complete
        if not cluster_points:
            break

    # The next cluster gets the next id
    current_cluster += 1

In [None]:
# Points that still have no cluster assigned
remaining_points = point_df[point_df['cluster'].isna()].index.to_list()

# Core points found so far. The list is kept as before; the set gives
# constant-time membership tests inside the loop below.
core_points = point_df[point_df['core'] == True].index.to_list()
core_point_set = set(core_points)

# Each remaining point either joins the cluster of a core point in its
# eps-neighbourhood (as a border point) or is declared noise (-1)
for curr_point in remaining_points:

    # Find all neighbors within eps radius
    neighbours = nearest_neigh.radius_neighbors(
        np.expand_dims(X[curr_point], axis=0),
        radius=eps,
        return_distance=False,
    )[0]

    # First core point among the neighbours, in the order returned by
    # the query; None when the neighbourhood contains no core point
    # (which covers the case where the query returns only one point)
    core_neighbour = next(
        (neighbour for neighbour in neighbours if neighbour in core_point_set),
        None,
    )

    if core_neighbour is None:
        # No core point within reach: noise
        point_df.loc[curr_point, 'cluster'] = -1
    else:
        # Within reach of a core point: border point of that cluster
        point_df.loc[curr_point, 'cluster'] = point_df.loc[core_neighbour, 'cluster']
        point_df.loc[curr_point, 'border'] = True
        

In [None]:
# Scatter plot of the data, coloured by the cluster id found above
cluster_labels = point_df['cluster'].to_numpy()
sns.scatterplot(data=X_df, x='dim1', y='dim2', hue=cluster_labels)