# <span style="color:#0073e6">0. 사례준비</span>
<style>
@media print
{
h1 {page-break-before:always}
}
</style>

> **Library & Environment Settings**

In [None]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.rc('font', family='malgun gothic')
plt.rc('axes', unicode_minus=False)
np.random.seed(123)
%matplotlib inline

import time

> **MNIST DATA SET LOADING**

In [None]:
# The pickle holds three items; only the third one is used below
# (presumably the train/validation/test splits — TODO confirm).
# pickle can execute arbitrary code: only load files from a trusted source.
with open('data/mnist.pkl', 'rb') as mnist_file:
    _, _, third_split = pickle.load(mnist_file, encoding='latin1')
x = third_split[0]
y = third_split[1]
del third_split
print(x.shape, y.shape, sep='\n')

# <span style="color:#0073e6">1. 대표적인 군집화 성능 평가지표</span>
### 1.1. 사전에 정의된 그룹이 있는 경우

In [None]:
from sklearn.metrics import homogeneity_score
from sklearn.metrics import completeness_score
from sklearn.metrics import v_measure_score
from sklearn.metrics import silhouette_score
# Toy example: `c` and `t` describe the same two groups of three items using
# different label values; each metric is called as metric(t, c).
c = [0, 0, 0, 1, 1, 1]
t = [1, 1, 1, 2, 2, 2]
for metric in (homogeneity_score, completeness_score, v_measure_score):
    print(metric(t, c))

### 1.2. Silhouette Coefficient


# <span style="color:#0073e6">2. Partitioning methods</span>
### 2.1. K-means

In [None]:
#from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
# Fit mini-batch k-means with 10 clusters on the raw features; `fit` returns
# the estimator itself, which is echoed as the cell output.
kmeans = MiniBatchKMeans(n_clusters=10).fit(x)
kmeans

> **클러스터링 성능**
>> 사전 정의 군집 재현 정도


In [None]:
# Compare the predicted cluster ids against the known labels `y`.
pred = kmeans.predict(x)
for template, metric in (
    (' homogeneity: %.4f', homogeneity_score),
    ('completeness: %.4f', completeness_score),
    ('   v-measure: %.4f', v_measure_score),
):
    print(template % metric(y, pred))

>> Silhouette Coefficient

In [None]:
# Silhouette needs only the features and the assigned cluster ids.
silhouette = silhouette_score(x, pred)
print('Silhouette Coefficient : %.4f' % silhouette)

> **차원축소기법을 사용한 클러스터링 성능 개선**

In [None]:
from sklearn.decomposition import IncrementalPCA
# NOTE(review): n_components=784 presumably equals the number of input
# features, in which case no dimensions are dropped — TODO confirm.
pca = IncrementalPCA(n_components=784)
pca_x = pca.fit(x).transform(x)

# Re-cluster in the PCA space.
kmeans = MiniBatchKMeans(n_clusters=10).fit(pca_x)
pred = kmeans.predict(pca_x)
v_measure = v_measure_score(y, pred)
# NOTE(review): the silhouette is evaluated on the original `x`, while the
# clusters were fitted on `pca_x` — confirm this is intended.
silhouette = silhouette_score(x, pred)
print(v_measure, silhouette)

> **군집개수(k)의 결정**

In [None]:
# Sweep k = 2..50 and record one row of metrics per k.
records = []
for k in range(2, 51):
    kmeans = MiniBatchKMeans(n_clusters=k).fit(pca_x)
    pred = kmeans.predict(pca_x)
    records.append({
        'k': k,
        'Inertia': kmeans.inertia_,
        'V-measure': v_measure_score(y, pred),
        'Silhouette': silhouette_score(x, pred),
    })
results = pd.DataFrame(records, columns=['k', 'Inertia', 'V-measure', 'Silhouette'])
results.head()

In [None]:
# One stacked panel per metric, each with a tick at every evaluated k.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(10,10))
for panel, metric in zip(ax, ['Inertia', 'V-measure', 'Silhouette']):
    sns.lineplot(x='k', y=metric, data=results, ax=panel)
    panel.set_xticks(results['k'])

> **군집화 모델링 함수화**

In [None]:
def cluster_scores(model, x, y):
    """Fit a clustering model and score its assignments.

    Parameters
    ----------
    model : estimator exposing ``fit(x)`` and ``predict(x)``.
    x : feature matrix used for fitting, prediction and the silhouette.
    y : reference labels used for the V-measure.

    Returns
    -------
    tuple
        ``(v_measure, silhouette, elapsed_seconds)``. ``silhouette`` is NaN
        when the number of distinct predicted labels is outside
        ``2 .. n_samples - 1``, where ``silhouette_score`` would raise.
        ``elapsed_seconds`` covers only ``fit`` + ``predict``.
    """
    # perf_counter is a monotonic clock, so the interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    model.fit(x)
    pred = model.predict(x)
    elapsed = time.perf_counter() - start_time

    v = v_measure_score(y, pred)

    # Degenerate clusterings (one cluster, or one cluster per sample) have no
    # defined silhouette; report NaN instead of aborting the comparison.
    n_labels = np.unique(pred).size
    if 2 <= n_labels <= len(pred) - 1:
        s = silhouette_score(x, pred)
    else:
        s = float('nan')
    return v, s, elapsed

In [None]:
# Build the model to evaluate first, then start the comparison table that
# the later cells extend with one row per algorithm.
k = 12
kmeans = MiniBatchKMeans(n_clusters=k)
scores = cluster_scores(kmeans, pca_x, y)

results = pd.DataFrame(columns=['v_measure', 'silhouette','time'])
results.loc['kmeans'] = scores
results

### 2.2. Mean Shift

In [None]:
from sklearn.cluster import MeanShift
# Mean Shift with a fixed bandwidth of 2; n_jobs=-1 asks for all CPU cores.
# The number of clusters is not specified here — it is read back from the
# fitted model in a later cell.
MS = MeanShift(bandwidth=2, n_jobs=-1)
scores = cluster_scores(MS, pca_x, y)
# Add this model's (v_measure, silhouette, time) row to the comparison table
# and display the table.
results.loc['MeanShift'] = scores
results

> **도출된 군집 개수 확인**

In [None]:
# One row per cluster centre, so the row count is the number of clusters.
len(MS.cluster_centers_)

# <span style="color:#0073e6">3. Hierarchical Methods</span>
### 3.1. AgglomerativeClustering

In [None]:
def cluster_scores2(model, x, y):
    """Fit a clustering model via ``fit_predict`` and score its assignments.

    Variant of the fit/predict scorer for estimators that only expose
    ``fit_predict``.

    Parameters
    ----------
    model : estimator exposing ``fit_predict(x)``.
    x : feature matrix used for clustering and the silhouette.
    y : reference labels used for the V-measure.

    Returns
    -------
    tuple
        ``(v_measure, silhouette, elapsed_seconds)``. ``silhouette`` is NaN
        when the number of distinct predicted labels is outside
        ``2 .. n_samples - 1``, where ``silhouette_score`` would raise.
        ``elapsed_seconds`` covers only ``fit_predict``.
    """
    # perf_counter is a monotonic clock, so the interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    pred = model.fit_predict(x)
    elapsed = time.perf_counter() - start_time

    v = v_measure_score(y, pred)

    # Degenerate results (a single label, or one label per sample) have no
    # defined silhouette; report NaN instead of aborting the comparison.
    n_labels = np.unique(pred).size
    if 2 <= n_labels <= len(pred) - 1:
        s = silhouette_score(x, pred)
    else:
        s = float('nan')
    return v, s, elapsed

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Agglomerative clustering cut at 10 clusters. It is scored with
# cluster_scores2, which calls fit_predict rather than fit + predict.
AC = AgglomerativeClustering(n_clusters=10)
scores = cluster_scores2(AC, pca_x, y)
# Add this model's (v_measure, silhouette, time) row to the comparison table
# and display the table.
results.loc['AGNES'] = scores
results

> **덴드로그램 시각화 함수**

In [None]:
from scipy.cluster.hierarchy import dendrogram
def plot_dendrogram(model, **kwargs):
    """Draw a SciPy dendrogram for a fitted hierarchical clustering model.

    Reads ``model.children_``, ``model.labels_`` and ``model.distances_`` and
    assembles a linkage matrix with columns (child a, child b, distance,
    number of samples under the merge).

    Parameters
    ----------
    model : fitted estimator providing the three attributes above.
    **kwargs : forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.

    Returns
    -------
    Whatever ``dendrogram`` returns.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # subtree_sizes[i] = number of original samples below the i-th merge.
    subtree_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        # Ids below n_leaves are single samples; larger ids refer to the
        # merge created at row (id - n_leaves).
        subtree_sizes[row] = sum(
            1 if node < n_leaves else subtree_sizes[node - n_leaves]
            for node in pair
        )

    linkage_matrix = np.column_stack(
        [merges, model.distances_, subtree_sizes]
    ).astype(float)

    return dendrogram(linkage_matrix, **kwargs)

In [None]:
from sklearn.datasets import load_iris
# Only the iris feature matrix is needed for the demo.
iris = load_iris().data

# distance_threshold=0 with n_clusters=None: no fixed cluster count is
# requested. plot_dendrogram reads `distances_` from the fitted model.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None).fit(iris)

# Show only the top three levels of the tree; `;` hides the returned dict.
plot_dendrogram(model, truncate_mode='level', p=3);

### 3.2. BIRCH

In [None]:
from sklearn.cluster import Birch
# BIRCH asked for 10 final clusters; scored with the fit + predict helper.
BIRCH = Birch(n_clusters=10)
scores = cluster_scores(BIRCH, pca_x, y)
# Add this model's (v_measure, silhouette, time) row to the comparison table
# and display the table.
results.loc['Birch'] = scores
results

# <span style="color:#0073e6">4. Density-based Methods</span>
### 4.1. DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
# DBSCAN with neighbourhood radius eps=2 (other parameters left at their
# defaults). It is scored with cluster_scores2, which calls fit_predict.
dbscan = DBSCAN(eps=2)
scores = cluster_scores2(dbscan, pca_x, y)
# Add this model's (v_measure, silhouette, time) row to the comparison table
# and display the table.
results.loc['DBSCAN'] = scores
results

> **군집 개수 확인**

In [None]:
# Number of clusters = distinct labels, excluding -1, which DBSCAN uses for
# noise points. (`core_sample_indices_` lists core samples, not clusters.)
int(np.sum(np.unique(dbscan.labels_) != -1))

### 4.2. OPTICS

In [None]:
from sklearn.cluster import OPTICS
# OPTICS with the neighbourhood search radius capped at max_eps=3 (other
# parameters left at their defaults). Scored via fit_predict.
optics = OPTICS(max_eps=3)
scores = cluster_scores2(optics, pca_x, y)
# Add this model's (v_measure, silhouette, time) row to the comparison table
# and display the table.
results.loc['OPTICS'] = scores
results

> **군집 개수 확인**

In [None]:
# Number of flat clusters behind the scores above = distinct labels,
# excluding -1 (noise). `cluster_hierarchy_` also lists nested clusters, so
# its length is usually larger than the number of labels.
int(np.sum(np.unique(optics.labels_) != -1))