In [1]:
%run ../common-imports.ipynb

# DBSCAN common utilities

In [2]:
from matplotlib.axes import Axes

def dbscan_cluster_plot(eps: float, nPts:int,  data:pd.DataFrame, ax:Axes) -> DBSCAN:
    """Cluster the first two columns of ``data`` with DBSCAN and plot the result.

    Clustered points are drawn as large, colour-coded markers (one colour per
    cluster label); points labelled ``-1`` (noise/outliers) are drawn as small
    black markers. The axes title reports the hyper-parameters and the number
    of clusters found.

    The caller's ``data`` frame is NOT modified: the two feature columns are
    copied into a private frame. Per-point labels are available on the returned
    estimator as ``clusterer.labels_``.

    Args:
        eps: Neighbourhood radius, passed to ``DBSCAN(eps=...)``.
        nPts: Minimum neighbourhood size, passed to ``DBSCAN(min_samples=...)``.
        data: Frame whose first two columns (by position) are the features.
            Any further columns are ignored.
        ax: Matplotlib axes to draw on.

    Returns:
        The fitted ``DBSCAN`` estimator.

    Raises:
        IndexError: If ``data`` has fewer than two columns.
    """
    # NOTE(review): DBSCAN is presumably sklearn.cluster.DBSCAN, brought into
    # scope by common-imports — confirm.

    # Private copy of the two features, so repeated calls on the same frame do
    # not accumulate helper columns in the caller's data.
    X = pd.DataFrame({'x1': data.iloc[:, 0], 'x2': data.iloc[:, 1]})

    clusterer = DBSCAN(eps=eps, min_samples=nPts)
    labels = np.asarray(clusterer.fit_predict(X))

    # Label -1 marks noise; it is not counted as a cluster.
    is_noise = labels == -1
    n_clusters = np.unique(labels[~is_noise]).size

    # Clustered points: colour encodes the cluster label.
    ax.scatter(X.loc[~is_noise, 'x1'],
               X.loc[~is_noise, 'x2'],
               c=labels[~is_noise],
               s=150,
               cmap='Spectral',
               alpha=0.5,
               edgecolor='black')

    # Outliers: small black points. No cmap here: with a literal colour
    # matplotlib ignores the colormap and emits a warning.
    ax.scatter(X.loc[is_noise, 'x1'],
               X.loc[is_noise, 'x2'],
               c='black',
               s=50,
               alpha=0.3)

    ax.set_title(f'eps:{eps}, nPts: {nPts}, clusters: {n_clusters}')
    return clusterer

In [10]:
from typing import List
def dbscan_cluster(epsilons: List[float], neighbors:List[int], data:pd.DataFrame) -> pd.DataFrame:
    """Run DBSCAN over a grid of hyper-parameters, plot each fit, and tabulate quality.

    One subplot is drawn per ``(nPts, epsilon)`` pair: rows follow
    ``neighbors``, columns follow ``epsilons``. Each fit is delegated to
    ``dbscan_cluster_plot``, which clusters the first two columns of ``data``.

    Args:
        epsilons: Candidate ``eps`` values (one subplot column each).
        neighbors: Candidate ``min_samples`` values (one subplot row each).
        data: Frame whose first two columns are the features. If it has a
            ``label`` column, that column is treated as ground truth and the
            supervised clustering metrics are reported as well.

    Returns:
        A DataFrame with one row per hyper-parameter combination and columns
        ``epsilon``, ``nPts``, ``clusters``, ``silhouette score`` and, when
        ``data`` is labelled, ``homogeniety``, ``completeness``, ``v-measure``,
        ``adjusted rand index`` and ``adjusted mutual information``.

    Raises:
        ValueError: If ``epsilons`` or ``neighbors`` is empty.
    """
    # NOTE(review): `metrics` is presumably sklearn.metrics, brought into scope
    # by common-imports — confirm.
    row_count = len(neighbors)
    col_count = len(epsilons)
    if row_count == 0 or col_count == 0:
        raise ValueError('epsilons and neighbors must each contain at least one value')

    is_labeled = 'label' in data.columns

    columns = ['epsilon',
               'nPts',
               'clusters',
               'silhouette score']

    if is_labeled:
        # 'homogeniety' is misspelt, but it is kept as-is because callers may
        # index the result by this column name.
        columns.extend(['homogeniety',
                        'completeness',
                        'v-measure',
                        'adjusted rand index',
                        'adjusted mutual information'])

    # Snapshot the inputs to the metrics up front, so they do not depend on
    # whether the plotting helper adds columns to `data`.
    features = data.iloc[:, :2].copy()   # the two columns DBSCAN is fitted on
    y_true = data['label'].copy() if is_labeled else None
    n_samples = len(features)

    # figsize is (width, height): width grows with columns, height with rows.
    # squeeze=False keeps `axes` two-dimensional even for a 1xN or Nx1 grid.
    fig, axes = plt.subplots(row_count, col_count,
                             figsize=(8*col_count, 5*row_count),
                             squeeze=False)

    # Collect one record per combination and build the frame once at the end.
    records = []
    for row, nPts in enumerate(neighbors):
        for col, eps in enumerate(epsilons):
            clusterer = dbscan_cluster_plot(eps, nPts, data, axes[row][col])
            y_pred = clusterer.labels_

            distinct = np.unique(y_pred)
            n_clusters = len(distinct) - (1 if -1 in distinct else 0)

            # The silhouette score is only defined for 2 <= n_labels <= n_samples - 1;
            # report 0 outside that range instead of letting it raise.
            if 1 < len(distinct) < n_samples:
                silhouette = metrics.silhouette_score(features, y_pred)
            else:
                silhouette = 0

            values = [eps, nPts, n_clusters, silhouette]

            if is_labeled:
                values.extend([metrics.homogeneity_score(y_true, y_pred),
                               metrics.completeness_score(y_true, y_pred),
                               metrics.v_measure_score(y_true, y_pred),
                               metrics.adjusted_rand_score(y_true, y_pred),
                               metrics.adjusted_mutual_info_score(y_true, y_pred)])

            records.append(values)

    # NOTE(review): the title uses LaTeX markup, so this presumably relies on
    # matplotlib's usetex being enabled by common-imports — confirm.
    fig.suptitle(r'\textbf{\Huge Sensitivity of DBSCAN to choices of the hyperparameters}')
    fig.tight_layout()

    return pd.DataFrame(records, columns=columns)