In [27]:
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score

In [28]:
def visualize_km_silhouette(cluster_lists, X_features): 
    
    from sklearn.datasets import make_blobs
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_samples, silhouette_score

    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    import math
    
    # 입력값으로 클러스터링 갯수들을 리스트로 받아서, 각 갯수별로 클러스터링을 적용하고 실루엣 개수를 구함
    n_cols = len(cluster_lists)
    
    # plt.subplots()으로 리스트에 기재된 클러스터링 수만큼의 sub figures를 가지는 axs 생성 
    fig, axs = plt.subplots(figsize=(4*n_cols, 4), nrows=1, ncols=n_cols)
    
    # 리스트에 기재된 클러스터링 갯수들을 차례로 iteration 수행하면서 실루엣 개수 시각화
    # cluster_lists에 군집의 갯수들이 저장되어 있음
    for ind, n_cluster in enumerate(cluster_lists):
        
        # KMeans 클러스터링 수행하고, 실루엣 스코어와 개별 데이터의 실루엣 값 계산. 
        clusterer = KMeans(n_clusters = n_cluster, max_iter=500, random_state=0)
        cluster_labels = clusterer.fit_predict(X_features)
        
        sil_avg = silhouette_score(X_features, cluster_labels) # 전체 데이터의 실루엣 계수의 평균
        sil_values = silhouette_samples(X_features, cluster_labels) # 개별 데이터의 실루엣 계쑤
        
        # 그래프 그리는 코드
        y_lower = 10
        axs[ind].set_title('Number of Cluster : '+ str(n_cluster)+'\n' \
                          'Silhouette Score :' + str(round(sil_avg,3)) )
        axs[ind].set_xlabel("The silhouette coefficient values")
        axs[ind].set_ylabel("Cluster label")
        axs[ind].set_xlim([-0.1, 1])
        axs[ind].set_ylim([0, len(X_features) + (n_cluster + 1) * 10])
        axs[ind].set_yticks([])  # Clear the yaxis labels / ticks
        axs[ind].set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
        
        # 클러스터링 갯수별로 fill_betweenx( )형태의 막대 그래프 표현. 
        for i in range(n_cluster):
            ith_cluster_sil_values = sil_values[cluster_labels==i]
            ith_cluster_sil_values.sort()
            
            size_cluster_i = ith_cluster_sil_values.shape[0]
            y_upper = y_lower + size_cluster_i
            
            color = cm.nipy_spectral(float(i) / n_cluster)
            axs[ind].fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_sil_values, \
                                facecolor=color, edgecolor=color, alpha=0.7)
            axs[ind].text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10
            
        axs[ind].axvline(x=sil_avg, color="red", linestyle="--")

In [29]:
from sklearn.preprocessing import MinMaxScaler
minmaxscaler = MinMaxScaler()

In [4]:
dog = pd.read_csv('../data/정리/강아지(221028).csv')

In [5]:
dog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4183 entries, 0 to 4182
Data columns (total 78 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   level_0        4183 non-null   int64  
 1   index          4183 non-null   int64  
 2   유기번호           4183 non-null   int64  
 3   썸네일            4183 non-null   object 
 4   접수일            4183 non-null   int64  
 5   발견장소           4183 non-null   object 
 6   품종_x           4183 non-null   object 
 7   색상_x           4183 non-null   object 
 8   나이             4183 non-null   object 
 9   체중_x           4183 non-null   object 
 10  공고번호           4183 non-null   object 
 11  공고시작일          4183 non-null   int64  
 12  공고종료일          4183 non-null   int64  
 13  이미지            4183 non-null   object 
 14  성별             4183 non-null   object 
 15  중성화여부          4183 non-null   object 
 16  특징             4183 non-null   object 
 17  보호소이름          4183 non-null   object 
 18  보호소전화번호 

In [6]:
feature = dog[['체중_숫자']]

kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=0).fit(feature)

dog['체중_군집'] = kmeans.labels_

In [7]:
dog.groupby('체중_군집')['체중_숫자'].mean()

체중_군집
0    11.854529
1     3.472964
2    22.699612
Name: 체중_숫자, dtype: float64

In [8]:
dict(dog.groupby('체중_군집')['체중_숫자'].mean().argsort())

{0: 1, 1: 0, 2: 2}

In [9]:
dog=dog.replace({'체중_군집':dict(dog.groupby('체중_군집')['체중_숫자'].mean().argsort())})

In [10]:
dog['체중_군집'].value_counts()

0    2802
1    1020
2     361
Name: 체중_군집, dtype: int64

In [11]:
feature = dog[['나이_주환산']]

kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=0).fit(feature)

dog['나이_군집'] = kmeans.labels_

In [12]:
dog.groupby('나이_군집')['나이_주환산'].mean()

나이_군집
0     26.738570
1    156.931034
2    408.460967
Name: 나이_주환산, dtype: float64

In [13]:
dog=dog.replace({'나이_군집':dict(dog.groupby('나이_군집')['나이_주환산'].mean().argsort())})

In [14]:
dog['나이_군집'].value_counts()

0    2406
1    1508
2     269
Name: 나이_군집, dtype: int64

In [15]:
feature = dog[['친화성_score']]

kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=0).fit(feature)

dog['친화성_군집'] = kmeans.labels_

In [16]:
dog.groupby('친화성_군집')['친화성_score'].mean()

친화성_군집
0   -1.296131
1    2.611991
2    0.186905
Name: 친화성_score, dtype: float64

In [17]:
dog['친화성_군집'].value_counts()

2    2627
1     884
0     672
Name: 친화성_군집, dtype: int64

In [18]:
dog=dog.replace({'친화성_군집':dict(dog.groupby('친화성_군집')['친화성_score'].mean().argsort())})

In [19]:
feature = dog[['api_건강점수']]

kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=300, random_state=0).fit(feature)

dog['건강상태_군집'] = kmeans.labels_

In [20]:
dog.groupby('건강상태_군집')['api_건강점수'].mean()

건강상태_군집
0    7.000
1    5.000
2    6.000
3    4.000
4    2.625
Name: api_건강점수, dtype: float64

In [21]:
dog=dog.replace({'건강상태_군집':dict(dog.groupby('건강상태_군집')['api_건강점수'].mean().argsort())})

In [22]:
dog['건강상태_군집'].value_counts()

4    3562
1     353
3     188
2      56
0      24
Name: 건강상태_군집, dtype: int64

In [23]:
dog['성견'] = 0

In [24]:
dog.loc[dog['만나이']>=1, '성견'] = 1

In [105]:
feature = dog[['성견', '나이_주환산', '체중_숫자']]

minmaxscaler.fit(feature)
dog_minmax_scaled = minmaxscaler.transform(feature)

kmeans = KMeans(n_clusters=9, init='k-means++', max_iter=300, random_state=0).fit(dog_minmax_scaled)

dog['체중+나이_군집'] = kmeans.labels_

In [107]:
dog.groupby('체중+나이_군집')[['체중_숫자','만나이']].mean()

Unnamed: 0_level_0,체중_숫자,만나이
체중+나이_군집,Unnamed: 1_level_1,Unnamed: 2_level_1
0,24.716336,2.926724
1,2.356904,0.0
2,6.038162,1.503115
3,6.153984,9.455285
4,5.706977,5.705426
5,8.9076,0.0
6,15.321423,3.76779
7,6.529683,3.292683
8,14.694862,1.541436


In [None]:
feature = dog[['만나이', '체중_숫자', '친화성_score', 'api_건강점수']]

minmaxscaler.fit(feature)
dog_minmax_scaled = minmaxscaler.transform(feature)

kmeans = KMeans(n_clusters=12, init='k-means++', max_iter=1000, random_state=0).fit(dog_minmax_scaled)

dog['4개피쳐 군집'] = kmeans.labels_

In [None]:
dog.to_csv('../군집/군집결과/결과1.csv', index=False)

In [None]:
dog.groupby('4개피쳐 군집')[['만나이', '체중_숫자', '친화성_score', 'api_건강점수']].mean()

In [101]:
from sklearn.cluster import DBSCAN

feature = dog[['나이_주환산', '체중_숫자', '친화성_score', 'api_건강점수']]

dog_minmax_scaled = minmaxscaler.fit_transform(feature)

dbscan = DBSCAN(eps=0.09095, min_samples=10)
dbscan_labels = dbscan.fit_predict(dog_minmax_scaled)

dog['db스캔 라벨'] = dbscan_labels

In [102]:
dog.groupby('db스캔 라벨')[['나이_주환산', '체중_숫자', '친화성_score', 'api_건강점수']].mean()

Unnamed: 0_level_0,나이_주환산,체중_숫자,친화성_score,api_건강점수
db스캔 라벨,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,244.579819,12.676657,0.801205,5.554217
0,84.297288,6.838555,0.444155,7.0
1,97.55,5.856923,0.3,6.0
2,82.763348,4.691111,0.0,5.0
3,50.272727,3.168182,2.0,5.0
4,178.285714,13.857143,4.0,7.0
5,26.625,2.721667,0.0,4.0
6,442.0,3.833333,0.0,6.0


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)