In [1]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

In [2]:
import pandas as pd
import numpy as np

In [5]:
import matplotlib.pyplot as plt

In [3]:
## generate the two-moons toy dataset; random_state is fixed so that a
## Restart & Run All reproduces the same points on every run
X, y = make_moons(n_samples=1000, noise=0.05, random_state=42)

In [4]:
## set up DBSCAN: eps is the neighbourhood radius, min_samples is how many
## samples must fall inside that radius for a point to be a core sample
dbscan = DBSCAN(
    eps=0.05,
    min_samples=5,
)

In [9]:
## run the clustering on the moons data; the fitted estimator is the cell output
dbscan.fit(X)

DBSCAN(eps=0.05)

In [10]:
## cluster label assigned to each training sample (one entry per row of X)
dbscan.labels_

array([ 0,  1, -1,  2,  0, -1,  2,  3,  4,  2,  5, -1,  6,  0,  1,  7,  7,
        3,  2,  2,  3,  2,  2,  2,  3,  7,  3,  1,  3, -1,  1,  3,  2,  2,
        7,  0,  2,  2,  3,  3,  1,  1,  1,  2,  3,  8,  7, -1,  3,  1,  0,
        1,  3, -1,  9,  2,  0,  5,  7,  5,  4,  3,  2,  9,  7,  3,  0,  6,
        9,  7,  2,  7,  1,  2,  6,  3,  1,  2, -1,  6,  0,  3,  7,  6,  1,
        5,  2,  1,  1,  0,  7,  4,  2,  0,  3,  3,  3,  0,  1, -1,  2,  1,
        1,  3,  2,  3, -1,  1,  7,  3, -1,  8,  6,  5,  6,  3,  3,  1,  3,
        7,  5,  0,  2,  4,  9,  4,  9,  1,  5,  2,  6,  2,  2,  2,  2,  2,
        1,  7,  2,  6, -1,  0,  0,  1,  6,  2,  2,  1,  8,  7,  5,  2,  3,
        3,  0,  2,  6,  7,  5,  3,  0,  2,  3,  7,  3,  7,  7,  2,  9,  3,
        0,  7,  3,  3,  3,  7,  3,  3,  0,  3,  2,  3,  6,  2,  9,  5,  5,
        1, -1,  7,  5,  2,  3,  3,  9,  1,  7,  9,  7,  0,  7,  5,  6,  1,
        3,  1,  6,  9,  1,  4,  0,  2,  1,  1,  2,  2,  3,  3,  4,  8,  6,
        5,  3,  2,  0,  1

**Here, a label of -1 marks an instance that DBSCAN considers an anomaly (outlier), i.e. a point that was not assigned to any cluster.**

In [11]:
## positions (row indices into X) of the samples DBSCAN identified as core instances
dbscan.core_sample_indices_

array([  0,   1,   3,   4,   6,   7,   8,   9,  10,  12,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  23,  25,  26,  27,  28,  30,
        31,  32,  34,  35,  36,  37,  38,  39,  40,  41,  43,  44,  45,
        46,  50,  51,  52,  54,  55,  57,  58,  59,  60,  61,  62,  63,
        64,  65,  66,  67,  68,  69,  70,  72,  73,  74,  75,  76,  77,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  95,  96,  97,  98, 101, 102, 104, 105, 107, 108, 109,
       114, 115, 117, 118, 119, 120, 122, 123, 124, 126, 127, 128, 130,
       131, 132, 133, 134, 135, 136, 138, 139, 141, 142, 143, 144, 145,
       147, 148, 149, 150, 151, 152, 153, 154, 155, 157, 158, 159, 160,
       161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174,
       175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
       189, 190, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203,
       204, 205, 206, 207, 208, 209, 211, 213, 214, 215, 216, 21

In [12]:
## the core samples themselves (their feature vectors, not their indices)
dbscan.components_

array([[ 0.72584415, -0.51671362],
       [ 0.70421973,  0.80106274],
       [ 1.6413915 , -0.22980397],
       ...,
       [ 0.84729259,  0.51388501],
       [ 0.80515271,  0.50500482],
       [ 1.89761878,  0.06073098]])

**Somewhat surprisingly, the DBSCAN class does not have a predict() method, although it has a fit_predict() method. In other words, it cannot predict which cluster a new instance belongs to. So, in order to predict the label of a new instance, we need to turn this clustering problem into a classification problem: train a classifier on the instances DBSCAN has already labelled, then use it to label new ones.**

In [13]:
from sklearn.neighbors import KNeighborsClassifier

In [14]:
## train a k-NN classifier on the core samples only, using the cluster labels
## DBSCAN assigned to those same samples as the targets
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(
    X=dbscan.components_,
    y=dbscan.labels_[dbscan.core_sample_indices_],
)

KNeighborsClassifier(n_neighbors=50)

**In the call to `knn.fit` above, the first argument holds the core samples and the second argument holds their labels, obtained from the DBSCAN algorithm earlier. So basically we are training a classification model on these instances and their cluster labels.**

In [16]:
## ask the k-NN classifier which cluster each of four new points belongs to
X_new = np.array([
    [-0.5, 0.0],
    [0.0, 0.5],
    [1.0, -0.1],
    [2.0, 1.0],
])

knn.predict(X_new)

array([3, 6, 1, 2], dtype=int64)

In [17]:
## per-cluster probability estimates for the same new points (one row per point)
knn.predict_proba(X_new)

array([[0.  , 0.  , 0.  , 0.82, 0.  , 0.  , 0.18, 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  ],
       [0.32, 0.68, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ]])