# Clustering & PCA

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from matplotlib import rcParams
# Global matplotlib defaults for every figure below: font family, base font
# size, and axes.unicode_minus switched off.
# NOTE(review): 'New Gulim' is presumably chosen so the Korean axis labels
# render - confirm the font is installed on the target machine.
rcParams.update({
    'font.family': 'New Gulim',
    'font.size': 20,
    'axes.unicode_minus': False,
})

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from scipy.cluster.hierarchy import dendrogram, ward
from sklearn.cluster import DBSCAN

from sklearn.metrics.cluster import silhouette_score

from sklearn.decomposition import PCA

# 1. 보성군 날씨

#### 데이터 로드

In [None]:
# Boseong weather data: read the cp949-encoded CSV, parse the '시간' column
# as datetimes and promote it to the index.
df = pd.read_csv(
    'data/bosung_weather.csv',
    encoding='cp949',
    parse_dates=['시간'],
).set_index('시간')
df

In [None]:
# Keep two attributes, drop rows with missing values, then draw 200 rows.
# random_state pins the draw so the clusters, plots and scores in the
# following cells are reproducible from run to run (same seed as KMeans).
df = df[['기온(°C)', '풍속(m/s)']].dropna().sample(n=200, random_state=123)
df

### 1.1 K-Means Clustering

In [None]:
# 모델 생성
km = KMeans(n_clusters=4, random_state=123)

In [None]:
# 군집 분류
labels_km = km.fit_predict(df)

In [None]:
labels_km

In [None]:
print("군집의 크기: {}".format(np.bincount(labels_km)))

In [None]:
# 2차원 시각화
fig = plt.figure(figsize=(12,8))
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], c=labels_km, s=100, edgecolors='black')

plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=200,
            marker='^', c=range(km.n_clusters), linewidth=2, edgecolors='black')

plt.xlabel("기온")
plt.ylabel("풍속")
plt.show()

### 1.2 병합 군집 ( agglomerative clustering )

In [None]:
# 모델 생성
agg = AgglomerativeClustering(n_clusters=4)

In [None]:
# 군집 분류
labels_agg = agg.fit_predict(df)

In [None]:
labels_agg

In [None]:
print("군집의 크기: {}".format(np.bincount(labels_agg)))

In [None]:
# 2차원 시각화
fig = plt.figure(figsize=(12,8))
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], c=labels_agg, s=100, edgecolors='black')

plt.xlabel("기온")
plt.ylabel("풍속")

plt.show()

#### 1.2.1 Dendrogram

In [None]:
# scipy's ward() returns the linkage array describing the hierarchical
# merges and the distances between the merged clusters.
linkage_array = ward(df)

# Draw the dendrogram from linkage_array.
# With truncate_mode='level', p limits how many levels of the tree are shown.
plt.figure(figsize=(10,10))
dendrogram(linkage_array, p=3, truncate_mode='level', no_labels=True)

plt.xlabel("샘플 번호")
plt.ylabel("클러스터 거리")

# Dashed horizontal line at cluster distance 20, spanning the current x-range
ax = plt.gca()
bounds = ax.get_xbound()
ax.plot(bounds, [20, 20], '--', c='k')

# Render explicitly, like the other plotting cells; this also keeps the
# return value of ax.plot from being echoed as the cell output.
plt.show()

### 1.3 DBSCAN

In [None]:
# 모델 생성
dbscan = DBSCAN(min_samples=7, eps=1)

In [None]:
# 군집 분류
labels_dbscan = dbscan.fit_predict(df)

In [None]:
labels_dbscan

In [None]:
# 2차원 시각화
fig = plt.figure(figsize=(12,8))
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], c=labels_dbscan, s=100, edgecolors='black')

plt.xlabel("기온")
plt.ylabel("풍속")

### 1.4 군집 분석 평가 - 실루엣 점수

In [None]:
# One panel per clustering result, side by side, without axis ticks.
# Each title shows the model class name and its silhouette score on df.
fig, axes = plt.subplots(
    1, 3,
    figsize=(12, 3),
    subplot_kw={'xticks': (), 'yticks': ()},
)

rcParams['font.size'] = 15

# K-Means
axes[0].scatter(df.iloc[:, 0], df.iloc[:, 1], s=60, c=labels_km, edgecolors='black')
axes[0].set_title(f"{type(km).__name__} : {silhouette_score(df, labels_km):.2f}")

# Agglomerative clustering
axes[1].scatter(df.iloc[:, 0], df.iloc[:, 1], s=60, c=labels_agg, edgecolors='black')
axes[1].set_title(f"{type(agg).__name__} : {silhouette_score(df, labels_agg):.2f}")

# DBSCAN
axes[2].scatter(df.iloc[:, 0], df.iloc[:, 1], s=60, c=labels_dbscan, edgecolors='black')
axes[2].set_title(f"{type(dbscan).__name__} : {silhouette_score(df, labels_dbscan):.2f}")

plt.show()

### 1.5 PCA - 주성분 분석

In [None]:
# Reload the full weather table (all columns) for the PCA section, with the
# '시간' column parsed as datetimes and used as the index.
df = pd.read_csv(
    'data/bosung_weather.csv',
    encoding='cp949',
    parse_dates=['시간'],
).set_index('시간')
df

In [None]:
# Drop rows with missing values, then draw 200 rows.
# random_state pins the draw so the PCA projection, clusters and scores in
# the following cells are reproducible from run to run.
df = df.dropna().sample(n=200, random_state=123)

In [None]:
# PCA 모델 생성
pca = PCA(n_components=2)

In [None]:
# PCA Transform
df_pca = pca.fit_transform(df)
df_pca

In [None]:
# K-Means clustering
km = KMeans(n_clusters=4, random_state=123)
labels_km = km.fit_predict(df_pca)

In [None]:
# 병합 군집 (agglomerative clustering)
agg = AgglomerativeClustering(n_clusters=4)
labels_agg = agg.fit_predict(df_pca)

In [None]:
# DBSCAN
dbscan = DBSCAN(min_samples=7, eps=3)
labels_dbscan = dbscan.fit_predict(df_pca)

In [None]:
# Silhouette score for each clustering result.
# The labels were fitted on df_pca and the panels plot df_pca, so the score
# is computed in that same PCA space rather than against the original df.

fig, axes = plt.subplots(1, 3, figsize=(12, 3),subplot_kw={'xticks': (), 'yticks': ()})

rcParams['font.size'] = 15

# One panel per model: scatter of the two PCA components coloured by label,
# titled with the model class name and its silhouette score.
for panel, model, labels in zip(axes,
                                (km, agg, dbscan),
                                (labels_km, labels_agg, labels_dbscan)):
    panel.scatter(df_pca[:, 0], df_pca[:, 1], c=labels, s=60, edgecolors='black')
    panel.set_title("{} : {:.2f}".format(model.__class__.__name__,
                                         silhouette_score(df_pca, labels)))

plt.show()


---

# 2. 유럽 국가별 단백질 섭취원 비율

#### 데이터 로드

In [None]:
# Protein sources by European country: read the CSV and use 'Country' as
# the index.
df_p = pd.read_csv('data/protein.csv').set_index('Country')
df_p

In [None]:
# 모델 생성
km_p = KMeans(n_clusters=5, random_state=123)

In [None]:
# 군집 분류
labels_km_p = km_p.fit_predict(df_p)

In [None]:
labels_km_p

In [None]:
df_p['gpnum'] = labels_km_p
df_p.sort_values(by='gpnum')

---

In [None]:
# end of file