# 1. dataset loading

In [1]:
# wine dataset을 사용

from sklearn.datasets import load_wine

wine_data = load_wine()
# wine_data는 data(features : X)와 target(label : y)으로 구성
X = wine_data.data
y = wine_data.target

print(X.shape)
print(y.shape)

(178, 13)
(178,)


# 2. dbscan with numpy

In [5]:
import numpy as np

# 거리계산 함수
def euclidean_distance (x1 , x2):
    return np.sqrt(np.sum(x1-x2)**2)

# X의 점들 중에서 point와의 거리가 eps 이내인 점들을 찾아서 리턴
def get_neighbors (X , point , eps):
    neighbors = []

    for i , candidate in enumerate(X):
        if (euclidean_distance(point , candidate) <= eps):
            neighbors.append(i)
    return neighbors

def trace_border(X , labels , point , neighbors , n_clusters ,eps , min_points):
    i = 0
    while i < len(neighbors):
        neighbor_point = neighbors[i]
        if (labels[neighbor_point]==-1):
            labels[neighbor_point] = n_clusters
            new_neighbors = get_neighbors(X , X[neighbor_point] , eps)
            if (len(new_neighbors) >= min_points):
                neighbors += new_neighbors
        i += 1

# dbscan은 cluster의 수가 미정
# dbscan은 cluster의 수와 label을 리턴
def dbscan (X , eps , min_points):
    m , n = X.shape
    labels = np.full(m,-1)
    n_clusters = 0

    for i , point in enumerate(X):
        if (labels[i] >= 0): # noise가 아니고 labeling이 됨
            continue

        # core point인지 아닌지 판단 --> point의 neighbor를 계산해서 , 그 갯수가 num_point보다 크면 core point
        # neighbor --> point와의 거리가 eps 이내인 점
        neighbors = get_neighbors (X , point , eps)

        if (len(neighbors) < min_points):   # Not core point --> continue
            continue

        n_clusters += 1
        labels[i] = n_clusters

        trace_border (X , labels , point , neighbors , n_clusters , eps , min_points)

    return n_clusters , labels

In [18]:
eps = 40
min_points = 3

n_labels , labels = dbscan(X , eps , min_points)

unique_labels , label_count = np.unique(labels , return_counts=True)
print(n_labels)
print(len(unique_labels))
print(label_count)

4
5
[  2  28   5 130  13]


# 3. dbscan with sklearn

In [19]:
from sklearn.cluster import DBSCAN

dbscansk = DBSCAN(eps=40 , min_samples=3)
dbscansk.fit(X)

unique_labels , label_count = np.unique(dbscansk.labels_ , return_counts=True)
print(len(unique_labels))
print(label_count)

5
[  3  27   5 130  13]
