In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

import tensorflow as tf
from keras.models import Sequential, Model, save_model, load_model
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Input, Flatten
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras.utils import plot_model, set_random_seed

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.datasets import load_iris
from sklearn.cluster import DBSCAN

plt.rcParams['font.family'] = 'Malgun Gothic'

In [6]:
np.random.seed(42)
tf.random.set_seed(42)
set_random_seed(42)

# 비원형 데이터의 군집화 - DBSCAN
- 분할적 군집화 방식
- 밀도(데이터의 밀집)기반 군집화 ==> 미리 군집 수를 지정할 필요 없음.
- 다양한 형태의 데이터에서 군집화 가능함.
- 이상치 데이터 제거도 가능함.
- 군집을 정하는 기준
    - 임의의 점(Point)에서 지정된 거리만큼 영역안에 지정된 데이터 수 존재 여부
    - 하이퍼파라미터 => 거리, 데이터 수

## [1] 데이터 로딩

In [14]:
iris=load_iris()

In [34]:
data = iris.data
target = iris.target
data.shape, type(data), target.shape, type(target), iris.feature_names, iris.target_names

((150, 4),
 numpy.ndarray,
 (150,),
 numpy.ndarray,
 ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'],
 array(['setosa', 'versicolor', 'virginica'], dtype='<U10'))

In [35]:
## DataFrame 형태로 데이터 로딩
X, y = load_iris(return_X_y=True, as_frame=True)
type(X), type(y)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

## [2] 데이터 전처리

In [37]:
# 스케일링
scaler=StandardScaler()
X_scaled = scaler.fit_transform(X)
X.shape, type(X), y.shape, type(y)

((150, 4), pandas.core.frame.DataFrame, (150,), pandas.core.series.Series)

## [3] 군집화

In [72]:
# 군집화 객체 생성
dbscan=DBSCAN(eps=0.6, min_samples=10, metric='euclidean')

In [73]:
# iris 데이터를 분류
irisDBS=dbscan.fit(X_scaled)

In [80]:
dbscan_labels=irisDBS.fit_predict(X)
dbscan_labels

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
      dtype=int64)

In [75]:
irisDBS.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
      dtype=int64)

In [76]:
irisDBS.components_.shape

(112, 4)

In [77]:
## 정답지랑 하나로 묶기
irisDF=pd.DataFrame(X_scaled)

In [78]:
irisDF['dbscan_cluster']=dbscan_labels
irisDF['target']=y

In [79]:
irisDF_result=irisDF.groupby(['target'])['dbscan_cluster'].value_counts()
irisDF_result

target  dbscan_cluster
0        0                49
        -1                 1
1        1                46
        -1                 4
2        1                42
        -1                 8
Name: dbscan_cluster, dtype: int64