In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
from yellowbrick.cluster import SilhouetteVisualizer

import warnings

warnings.filterwarnings(action = 'ignore')

## 1. 데이터 입력

### 1.1 행정동, 밀도 데이터 로드 및 병합

In [55]:
tmp_df = pd.read_csv('../data/행정동_1029.csv')
pd.set_option('display.max_columns',35)
tmp_df.rename(columns={'Unnamed: 0':'index'},inplace=True)
tmp_df.set_index('index',inplace=True)

In [56]:
tmp_df.columns

Index(['SUBWAY_NUM', 'STARBUCKS_NUM', 'SPORT_NUM', 'SAFE_DLVR_NUM',
       'POLICE_NUM', 'PHARM_NUM', 'NOISE_VIBRATION_NUM', 'MID_SCH_NUM',
       'MC_NUM', 'LEISURE_NUM', 'KINDER_NUM', 'KIDS_NUM', 'HOSPITAL_NUM',
       'HIGH_SCH_NUM', 'GYM_NUM', 'GOLF_NUM', 'FIRE_NUM', 'ELE_SCH_NUM',
       'DPTM_NUM', 'CON_NUM', 'CHILD_MED_NUM', 'CCTV_NUM', 'CAR_SHR_NUM',
       'CAFE_NUM', 'BUS_NUM', 'BIKE_NUM', 'ANI_HSPT_NUM', 'ACADEMY_NUM', 'GU',
       'DONG', 'DONG_CODE', 'BUS_CNT'],
      dtype='object')

In [57]:
tmp_df = tmp_df[['GU','DONG','DONG_CODE','SUBWAY_NUM','STARBUCKS_NUM','SPORT_NUM','SAFE_DLVR_NUM','POLICE_NUM','PHARM_NUM','NOISE_VIBRATION_NUM','MID_SCH_NUM','MC_NUM','LEISURE_NUM','KINDER_NUM','KIDS_NUM','HOSPITAL_NUM','HIGH_SCH_NUM','GYM_NUM','GOLF_NUM','FIRE_NUM','ELE_SCH_NUM','DPTM_NUM','CON_NUM','CHILD_MED_NUM','CCTV_NUM','CAR_SHR_NUM','CAFE_NUM','BUS_NUM','BIKE_NUM','ANI_HSPT_NUM','ACADEMY_NUM','BUS_CNT']]
tmp_df

Unnamed: 0_level_0,GU,DONG,DONG_CODE,SUBWAY_NUM,STARBUCKS_NUM,SPORT_NUM,SAFE_DLVR_NUM,POLICE_NUM,PHARM_NUM,NOISE_VIBRATION_NUM,MID_SCH_NUM,MC_NUM,LEISURE_NUM,KINDER_NUM,KIDS_NUM,HOSPITAL_NUM,HIGH_SCH_NUM,GYM_NUM,GOLF_NUM,FIRE_NUM,ELE_SCH_NUM,DPTM_NUM,CON_NUM,CHILD_MED_NUM,CCTV_NUM,CAR_SHR_NUM,CAFE_NUM,BUS_NUM,BIKE_NUM,ANI_HSPT_NUM,ACADEMY_NUM,BUS_CNT
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
0,종로구,청운효자동,1111051500,0.333333,0.333333,0.0,1.000000,2.000000,4.333333,18816.000000,2.000000,0.0,1.000000,0.000000,0.000000,3.0,0.0,0.000000,1.0,2.000000,0.0,4.000000,7.333333,56.666667,1.000000,39.666667,11.666667,5.666667,2.000000,18.000000,11.666667,14.0
1,종로구,사직동,1111053000,1.333333,7.333333,0.0,0.000000,3.000000,17.333333,22848.000000,0.000000,0.0,1.000000,10.000000,0.000000,0.0,18.0,3.000000,0.0,2.000000,0.0,20.000000,10.333333,41.666667,5.000000,100.666667,18.666667,11.666667,1.000000,19.000000,74.666667,16.0
2,종로구,삼청동,1111054000,1.000000,1.000000,1.0,0.000000,1.000000,2.000000,14112.000000,1.000000,0.0,0.000000,10.000000,0.000000,1.0,3.0,0.000000,0.0,0.000000,0.0,2.000000,1.000000,22.000000,1.000000,52.000000,8.000000,7.000000,0.000000,2.000000,7.000000,11.0
3,종로구,부암동,1111055000,0.000000,0.000000,0.0,0.000000,1.000000,1.000000,6048.000000,1.000000,0.0,0.000000,0.000000,0.000000,1.0,2.0,2.000000,1.0,4.000000,0.0,8.000000,5.000000,41.000000,5.000000,30.000000,18.000000,5.000000,3.000000,24.000000,4.000000,9.0
4,종로구,평창동,1111056000,0.000000,1.000000,0.0,0.000000,1.000000,5.000000,4032.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.0,5.0,5.000000,0.0,0.000000,0.0,6.000000,10.000000,21.000000,0.000000,36.000000,25.000000,7.000000,0.000000,28.000000,14.000000,37.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,강동구,성내2동,1174065000,0.333333,1.333333,0.0,0.666667,0.333333,12.666667,1442.333333,0.333333,0.0,1.666667,16.666667,0.666667,0.0,10.0,4.666667,1.0,1.333333,0.0,38.666667,4.000000,52.000000,6.666667,47.666667,11.000000,6.000000,2.666667,63.333333,59.333333,11.0
422,강동구,성내3동,1174066000,0.333333,1.333333,0.0,0.666667,0.333333,12.666667,1442.333333,0.333333,0.0,1.666667,16.666667,0.666667,0.0,10.0,4.666667,1.0,1.333333,0.0,38.666667,4.000000,52.000000,6.666667,47.666667,11.000000,6.000000,2.666667,63.333333,59.333333,9.0
423,강동구,길동,1174068500,1.000000,2.000000,0.0,1.000000,0.000000,35.000000,4327.000000,1.000000,0.0,3.000000,20.000000,1.000000,0.0,18.0,9.000000,1.0,6.000000,0.0,76.000000,10.000000,119.000000,10.000000,63.000000,22.000000,9.000000,8.000000,100.000000,120.000000,28.0
424,강동구,둔촌1동,1174069000,0.500000,0.000000,0.0,0.000000,1.000000,10.000000,2163.500000,1.500000,0.5,0.000000,0.000000,0.000000,1.0,4.0,3.000000,0.0,2.000000,0.0,17.000000,0.000000,32.500000,2.000000,17.500000,7.000000,5.500000,1.500000,38.000000,14.500000,4.0


In [58]:
density_df = pd.read_excel('../data/인구밀도.xlsx')
density_df

Unnamed: 0,GU,DONG,POP,AREA,DENSITY
0,종로구,사직동,9636,1.23,7834
1,종로구,삼청동,2739,1.49,1838
2,종로구,부암동,9782,2.27,4309
3,종로구,평창동,18329,8.87,2066
4,종로구,무악동,8297,0.36,23047
...,...,...,...,...,...
421,강동구,천호2동,33753,1.57,21499
422,강동구,길동,45973,2.17,21186
423,강동구,강일동,32982,2.26,14594
424,강동구,상일1동,38929,2.65,14690


### 분류 가중치 비율
- 교통
- 치안
- 건강 : 병원 : 약국 = 0.94 : 0.06
- 편의 :
- 교육 : 공교육(초,중,고) : 사교육(학원) = 1 : 0.7
- 육아

In [59]:
# 행정동 데이터, 밀도 데이터 병합
tmp = pd.merge(tmp_df, density_df)

# 컬럼 순서 정렬
tmp = tmp[['GU','DONG','DONG_CODE','POP','AREA','DENSITY',\
           'SUBWAY_NUM','BUS_CNT','BIKE_NUM','BUS_NUM',\# 교통
           'POLICE_NUM','FIRE_NUM','CCTV_NUM',\ # 치안
           'HOSPITAL_NUM','PHARM_NUM',\# 건강
           'DPTM_NUM','CON_NUM','CAFE_NUM',\# 편의
           'ELE_SCH_NUM','MID_SCH_NUM','HIGH_SCH_NUM','ACADEMY_NUM',\# 교육
           'KINDER_NUM','CHILD_MED_NUM',\# 육아
           'KIDS_NUM','STARBUCKS_NUM','MC_NUM','NOISE_VIBRATION_NUM','SAFE_DLVR_NUM','LEISURE_NUM','SPORT_NUM','GYM_NUM','GOLF_NUM','CAR_SHR_NUM','ANI_HSPT_NUM']]
tmp

Unnamed: 0,GU,DONG,DONG_CODE,POP,AREA,DENSITY,SUBWAY_NUM,BUS_CNT,BIKE_NUM,BUS_NUM,POLICE_NUM,FIRE_NUM,CCTV_NUM,SAFE_DLVR_NUM,HOSPITAL_NUM,PHARM_NUM,DPTM_NUM,CON_NUM,CAFE_NUM,ELE_SCH_NUM,MID_SCH_NUM,HIGH_SCH_NUM,ACADEMY_NUM,KINDER_NUM,CHILD_MED_NUM,KIDS_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,LEISURE_NUM,SPORT_NUM,GYM_NUM,GOLF_NUM,CAR_SHR_NUM,ANI_HSPT_NUM
0,종로구,청운효자동,1111051500,12177,2.57,4738,0.333333,14.0,2.000000,5.666667,2.000000,2.000000,1.000000,1.000000,3.0,4.333333,4.000000,7.333333,11.666667,0.0,2.000000,0.0,11.666667,0.000000,56.666667,0.000000,0.333333,0.0,18816.000000,1.000000,0.0,0.000000,1.0,39.666667,18.000000
1,종로구,사직동,1111053000,9636,1.23,7834,1.333333,16.0,1.000000,11.666667,3.000000,2.000000,5.000000,0.000000,0.0,17.333333,20.000000,10.333333,18.666667,0.0,0.000000,18.0,74.666667,10.000000,41.666667,0.000000,7.333333,0.0,22848.000000,1.000000,0.0,3.000000,0.0,100.666667,19.000000
2,종로구,삼청동,1111054000,2739,1.49,1838,1.000000,11.0,0.000000,7.000000,1.000000,0.000000,1.000000,0.000000,1.0,2.000000,2.000000,1.000000,8.000000,0.0,1.000000,3.0,7.000000,10.000000,22.000000,0.000000,1.000000,0.0,14112.000000,0.000000,1.0,0.000000,0.0,52.000000,2.000000
3,종로구,부암동,1111055000,9782,2.27,4309,0.000000,9.0,3.000000,5.000000,1.000000,4.000000,5.000000,0.000000,1.0,1.000000,8.000000,5.000000,18.000000,0.0,1.000000,2.0,4.000000,0.000000,41.000000,0.000000,0.000000,0.0,6048.000000,0.000000,0.0,2.000000,1.0,30.000000,24.000000
4,종로구,평창동,1111056000,18329,8.87,2066,0.000000,37.0,0.000000,7.000000,1.000000,0.000000,0.000000,0.000000,1.0,5.000000,6.000000,10.000000,25.000000,0.0,0.000000,5.0,14.000000,0.000000,21.000000,0.000000,1.000000,0.0,4032.000000,0.000000,0.0,5.000000,0.0,36.000000,28.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,강동구,성내2동,1174065000,24106,0.67,35979,0.333333,11.0,2.666667,6.000000,0.333333,1.333333,6.666667,0.666667,0.0,12.666667,38.666667,4.000000,11.000000,0.0,0.333333,10.0,59.333333,16.666667,52.000000,0.666667,1.333333,0.0,1442.333333,1.666667,0.0,4.666667,1.0,47.666667,63.333333
422,강동구,성내3동,1174066000,23346,0.71,32882,0.333333,9.0,2.666667,6.000000,0.333333,1.333333,6.666667,0.666667,0.0,12.666667,38.666667,4.000000,11.000000,0.0,0.333333,10.0,59.333333,16.666667,52.000000,0.666667,1.333333,0.0,1442.333333,1.666667,0.0,4.666667,1.0,47.666667,63.333333
423,강동구,길동,1174068500,45973,2.17,21186,1.000000,28.0,8.000000,9.000000,0.000000,6.000000,10.000000,1.000000,0.0,35.000000,76.000000,10.000000,22.000000,0.0,1.000000,18.0,120.000000,20.000000,119.000000,1.000000,2.000000,0.0,4327.000000,3.000000,0.0,9.000000,1.0,63.000000,100.000000
424,강동구,둔촌1동,1174069000,144,0.92,157,0.500000,4.0,1.500000,5.500000,1.000000,2.000000,2.000000,0.000000,1.0,10.000000,17.000000,0.000000,7.000000,0.0,1.500000,4.0,14.500000,0.000000,32.500000,0.000000,0.000000,0.5,2163.500000,0.000000,0.0,3.000000,0.0,17.500000,38.000000


In [136]:
val = tmp['SAFE_DLVR_NUM'].value_counts().index.tolist()
num = tmp['SAFE_DLVR_NUM'].value_counts().tolist()
s = 0
for i in range(len(val)):
    s += val[i] * num[i]
s

265.00000000000006

In [60]:
# 불필요 컬럼 제거
tmp = tmp.drop(['SPORT_NUM','FIRE_NUM','BUS_NUM'],axis=1)

In [61]:
tmp.describe()

Unnamed: 0,DONG_CODE,POP,AREA,DENSITY,SUBWAY_NUM,BUS_CNT,BIKE_NUM,POLICE_NUM,CCTV_NUM,SAFE_DLVR_NUM,HOSPITAL_NUM,PHARM_NUM,DPTM_NUM,CON_NUM,CAFE_NUM,ELE_SCH_NUM,MID_SCH_NUM,HIGH_SCH_NUM,ACADEMY_NUM,KINDER_NUM,CHILD_MED_NUM,KIDS_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,LEISURE_NUM,GYM_NUM,GOLF_NUM,CAR_SHR_NUM,ANI_HSPT_NUM
count,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0,426.0
mean,1143415000.0,22854.523474,1.420728,23150.119718,0.868545,15.647887,2.093897,0.565728,4.133803,0.622066,0.809859,12.169014,27.239437,14.323944,14.166667,0.06338,1.007042,7.448357,42.293427,21.57277,53.603286,0.518779,1.302817,0.204225,2452.551643,1.194836,3.626761,0.377934,37.985915,57.737089
std,19207780.0,9021.822196,1.571263,11326.683743,0.889106,9.95622,1.393441,0.437685,3.15675,0.503566,0.877893,7.266001,19.771517,14.060665,7.697127,0.207678,0.775321,7.034213,35.024099,18.688217,41.087587,0.656479,2.16424,0.363035,4534.820198,1.60698,4.318573,0.56879,37.562957,56.280168
min,1111052000.0,144.0,0.23,157.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.5,0.0,0.0,0.0,1.0,0.0,4.333333,0.0,0.0,0.0,125.75,0.0,0.0,0.0,2.0,1.0
25%,1126066000.0,16788.5,0.67,14512.0,0.270833,9.0,1.35,0.333333,2.285714,0.0,0.25,8.0,17.714286,4.38125,9.6,0.0,0.5,3.5,23.5,10.0,25.946429,0.0,0.333333,0.0,442.0375,0.0,1.333333,0.0,19.541667,27.090909
50%,1144064000.0,22141.5,0.97,23420.5,0.666667,14.0,2.0,0.5,3.5,0.666667,0.666667,11.0,22.5,10.333333,12.666667,0.0,1.0,6.0,34.5,20.0,41.375,0.333333,0.666667,0.0,1014.0,0.833333,2.5,0.2,27.363636,41.166667
75%,1161306000.0,27908.25,1.4875,31708.5,1.0,20.0,2.5,0.75,5.0,1.0,1.0,14.0,29.333333,18.0,16.6,0.0,1.4,9.0,48.0,30.0,65.73125,0.729167,1.333333,0.285714,2163.5,1.75,4.5,0.5,39.25,70.0
max,1174070000.0,56012.0,12.68,55657.0,6.0,79.0,12.0,3.5,24.0,3.5,6.5,50.0,170.0,101.0,62.0,1.5,6.0,60.0,255.0,170.0,251.0,4.25,18.5,4.0,56112.0,15.0,32.5,4.0,274.0,480.25


In [97]:
from scipy.stats import skew, kurtosis

In [105]:
columns = tmp.columns[6:]
skews = []
kurtosises = []
for column in columns:
    # 왜도 : 절댓값 3을 넘지 않아야 함
    skews.append(skew(tmp[column]))

    # 첨도 : 절댓값 8~10을 넘지 않아야 함
    kurtosises.append(kurtosis(tmp[column], fisher=True))
res = [skews,kurtosises]
standard_df = pd.DataFrame(data=res, index=['왜도','첨도'],columns=columns)

In [106]:
standard_df

Unnamed: 0,SUBWAY_NUM,BUS_CNT,BIKE_NUM,POLICE_NUM,CCTV_NUM,SAFE_DLVR_NUM,HOSPITAL_NUM,PHARM_NUM,DPTM_NUM,CON_NUM,CAFE_NUM,ELE_SCH_NUM,MID_SCH_NUM,HIGH_SCH_NUM,ACADEMY_NUM,KINDER_NUM,CHILD_MED_NUM,KIDS_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,LEISURE_NUM,GYM_NUM,GOLF_NUM,CAR_SHR_NUM,ANI_HSPT_NUM
왜도,2.116993,2.00405,2.109193,2.165971,2.53439,0.778953,2.893959,2.302759,3.402863,2.159069,1.986519,4.685652,1.825591,3.156503,3.367985,2.280446,1.825002,2.169966,4.282203,4.47708,5.679184,3.705893,3.555929,3.004027,3.557437,3.420124
첨도,6.570038,6.924614,9.16766,9.623087,9.854163,1.888575,13.2841,7.443356,17.314939,6.272822,7.69008,24.987152,7.176222,13.998211,14.449441,11.165904,4.083503,6.817194,23.438771,35.194645,50.380438,22.223878,16.007169,12.202719,15.602741,17.083714


In [124]:
over = []
for column in standard_df.columns:
    if((standard_df[column][0] > 3) and (standard_df[column][1] > 10)):
        over.append([column, standard_df[column][0], standard_df[column][1]])
over_std = pd.DataFrame(over).T

In [125]:
over_std.columns = over_std.iloc[0]
over_std = over_std.iloc[1:]
over_std

Unnamed: 0,DPTM_NUM,ELE_SCH_NUM,HIGH_SCH_NUM,ACADEMY_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,LEISURE_NUM,GYM_NUM,GOLF_NUM,CAR_SHR_NUM,ANI_HSPT_NUM
1,3.402863,4.685652,3.156503,3.367985,4.282203,4.47708,5.679184,3.705893,3.555929,3.004027,3.557437,3.420124
2,17.314939,24.987152,13.998211,14.449441,23.438771,35.194645,50.380438,22.223878,16.007169,12.202719,15.602741,17.083714


### 1.2 특성 간 결합 및 컬럼 정리

In [47]:
# 합하고자 하는 column 끼리 묶어서 list로 생성
sum_list = [['SUBWAY_NUM','BUS_CNT','BIKE_NUM'],
            ['POLICE_NUM','CCTV_NUM','SAFE_DLVR_NUM'],
            ['HOSPITAL_NUM','PHARM_NUM'],
            ['DPTM_NUM','CON_NUM','CAFE_NUM'],
            ['ELE_SCH_NUM','MID_SCH_NUM','HIGH_SCH_NUM','ACADEMY_NUM'],
            ['KINDER_NUM','CHILD_MED_NUM']]

# 묶어진 column 들의 각각의 column명 list
name_list = ['TRANSPORT','SECURITY','HEALTH','CONVINIENCE','EDUCATION','PARANTING']

In [48]:
def assembling(df,sum_list,name_list): # 큰 카테고리로 묶는 함수
    df_tmp = pd.DataFrame(data=range(len(df)))
    for j in range(len(sum_list)):
        x = df[sum_list[j]]
        k = []

        for i in range(len(df)):
            sum_k = sum(x.loc[i]) # 합하는 데 가중치 미부여 상태
            k.append(sum_k)

        x[name_list[j]] = k
        x = x.drop(sum_list[j],axis=1)
        df_tmp = df_tmp.join(x)
    del(df_tmp[0])
    return df_tmp

In [49]:
# 컬럼 끼리 더한 데이터
y = assembling(tmp_df,sum_list,name_list)
y

Unnamed: 0,TRANSPORT,SECURITY,HEALTH,CONVINIENCE,EDUCATION,PARANTING
0,16.333333,3.0,7.333333,23.000000,13.666667,56.666667
1,18.333333,3.0,17.333333,49.000000,92.666667,51.666667
2,12.000000,1.0,3.000000,11.000000,11.000000,32.000000
3,12.000000,1.0,2.000000,31.000000,7.000000,41.000000
4,37.000000,1.0,6.000000,41.000000,19.000000,21.000000
...,...,...,...,...,...,...
421,14.000000,1.0,12.666667,53.666667,69.666667,68.666667
422,12.000000,1.0,12.666667,53.666667,69.666667,68.666667
423,37.000000,1.0,35.000000,108.000000,139.000000,139.000000
424,6.000000,1.0,11.000000,24.000000,20.000000,32.500000


In [50]:
# 병합하고 컬럼 순서 재정렬하는 함수
def data_frame_redirect(df,asmb_df,sum_list,name_list):
    for i in range(len(sum_list)):
        df = df.drop(sum_list[i],axis=1)
    df = df.join(asmb_df)
    column_list = ['GU','DONG','DONG_CODE'] + name_list +['KIDS_NUM','STARBUCKS_NUM','MC_NUM','NOISE_VIBRATION_NUM','LEISURE_NUM','SPORT_NUM','GYM_NUM','GOLF_NUM','CAR_SHR_NUM','ANI_HSPT_NUM']
    df = df[column_list]
    return df

In [51]:
tmp_df = data_frame_redirect(tmp_df,y,sum_list,name_list) # 묶고 묶을 때 사용한 컬럼 제거한 뒤 정리된 데이터 프레임
tmp_df.columns

Index(['GU', 'DONG', 'DONG_CODE', 'TRANSPORT', 'SECURITY', 'HEALTH',
       'CONVINIENCE', 'EDUCATION', 'PARANTING', 'KIDS_NUM', 'STARBUCKS_NUM',
       'MC_NUM', 'NOISE_VIBRATION_NUM', 'LEISURE_NUM', 'SPORT_NUM', 'GYM_NUM',
       'GOLF_NUM', 'CAR_SHR_NUM', 'ANI_HSPT_NUM'],
      dtype='object')

**Noise Categorizing**

In [52]:
noise = tmp_df['NOISE_VIBRATION_NUM']
noise

index
0      18816.000000
1      22848.000000
2      14112.000000
3       6048.000000
4       4032.000000
           ...     
421     1442.333333
422     1442.333333
423     4327.000000
424     2163.500000
425     2163.500000
Name: NOISE_VIBRATION_NUM, Length: 426, dtype: float64

In [53]:
for i in range(len(noise)):
    if (noise[i]<400):
        noise[i] = 6
    elif (noise[i]>=400 and noise[i]<800):
        noise[i] = 5
    elif (noise[i]>=800 and noise[i]<1200):
        noise[i] = 4
    elif (noise[i]>=1200 and noise[i]<1600):
        noise[i] = 3
    elif (noise[i]>=1600 and noise[i]<3200):
        noise[i] = 2
    else:
        noise[i] = 1
noise

index
0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
421    3.0
422    3.0
423    1.0
424    2.0
425    2.0
Name: NOISE_VIBRATION_NUM, Length: 426, dtype: float64

In [54]:
noise.value_counts()

6.0    95
5.0    89
1.0    80
2.0    72
4.0    56
3.0    34
Name: NOISE_VIBRATION_NUM, dtype: int64

In [17]:
tmp_df['NOISE_VIBRATION_NUM'] = noise
tmp_df

Unnamed: 0_level_0,GU,DONG,DONG_CODE,TRANSPORT,SECURITY,HEALTH,CONVINIENCE,EDUCATION,PARANTING,KIDS_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,LEISURE_NUM,SPORT_NUM,GYM_NUM,GOLF_NUM,CAR_SHR_NUM,ANI_HSPT_NUM
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,종로구,청운효자동,1111051500,20.000000,3.0,16.000000,43.666667,25.0,7.333333,0.000000,0.333333,0.0,1.0,1.000000,0.0,0.0,0.000000,1.000000,2.000000
1,종로구,사직동,1111053000,29.000000,3.0,92.000000,120.666667,21.0,20.333333,0.000000,7.333333,0.0,1.0,1.000000,0.0,18.0,3.000000,5.000000,1.000000
2,종로구,삼청동,1111054000,19.000000,1.0,9.000000,54.000000,4.0,11.000000,0.000000,1.000000,0.0,1.0,0.000000,1.0,3.0,0.000000,1.000000,0.000000
3,종로구,부암동,1111055000,14.000000,1.0,5.000000,38.000000,30.0,5.000000,0.000000,0.000000,0.0,1.0,0.000000,0.0,2.0,2.000000,5.000000,3.000000
4,종로구,평창동,1111056000,44.000000,1.0,19.000000,42.000000,29.0,10.000000,0.000000,1.000000,0.0,1.0,0.000000,0.0,5.0,5.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,강동구,성내2동,1174065000,17.333333,1.0,72.666667,86.333333,65.0,20.666667,0.666667,1.333333,0.0,3.0,1.666667,0.0,10.0,4.666667,6.666667,2.666667
422,강동구,성내3동,1174066000,15.333333,1.0,72.666667,86.333333,65.0,20.666667,0.666667,1.333333,0.0,3.0,1.666667,0.0,10.0,4.666667,6.666667,2.666667
423,강동구,길동,1174068500,38.000000,1.0,155.000000,139.000000,107.0,30.000000,1.000000,2.000000,0.0,1.0,3.000000,0.0,18.0,9.000000,10.000000,8.000000
424,강동구,둔촌1동,1174069000,10.000000,1.0,24.500000,34.500000,42.5,0.000000,0.000000,0.000000,0.5,2.0,0.000000,0.0,4.0,3.000000,2.000000,1.500000


In [39]:
tmp_df[['CCTV_NUM']]

KeyError: "None of [Index(['CCTV_NUM'], dtype='object')] are in the [columns]"

**MinMaxScaling**

In [18]:
df = tmp_df.copy()
df

Unnamed: 0_level_0,GU,DONG,DONG_CODE,TRANSPORT,SECURITY,HEALTH,CONVINIENCE,EDUCATION,PARANTING,KIDS_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,LEISURE_NUM,SPORT_NUM,GYM_NUM,GOLF_NUM,CAR_SHR_NUM,ANI_HSPT_NUM
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,종로구,청운효자동,1111051500,20.000000,3.0,16.000000,43.666667,25.0,7.333333,0.000000,0.333333,0.0,1.0,1.000000,0.0,0.0,0.000000,1.000000,2.000000
1,종로구,사직동,1111053000,29.000000,3.0,92.000000,120.666667,21.0,20.333333,0.000000,7.333333,0.0,1.0,1.000000,0.0,18.0,3.000000,5.000000,1.000000
2,종로구,삼청동,1111054000,19.000000,1.0,9.000000,54.000000,4.0,11.000000,0.000000,1.000000,0.0,1.0,0.000000,1.0,3.0,0.000000,1.000000,0.000000
3,종로구,부암동,1111055000,14.000000,1.0,5.000000,38.000000,30.0,5.000000,0.000000,0.000000,0.0,1.0,0.000000,0.0,2.0,2.000000,5.000000,3.000000
4,종로구,평창동,1111056000,44.000000,1.0,19.000000,42.000000,29.0,10.000000,0.000000,1.000000,0.0,1.0,0.000000,0.0,5.0,5.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,강동구,성내2동,1174065000,17.333333,1.0,72.666667,86.333333,65.0,20.666667,0.666667,1.333333,0.0,3.0,1.666667,0.0,10.0,4.666667,6.666667,2.666667
422,강동구,성내3동,1174066000,15.333333,1.0,72.666667,86.333333,65.0,20.666667,0.666667,1.333333,0.0,3.0,1.666667,0.0,10.0,4.666667,6.666667,2.666667
423,강동구,길동,1174068500,38.000000,1.0,155.000000,139.000000,107.0,30.000000,1.000000,2.000000,0.0,1.0,3.000000,0.0,18.0,9.000000,10.000000,8.000000
424,강동구,둔촌1동,1174069000,10.000000,1.0,24.500000,34.500000,42.5,0.000000,0.000000,0.000000,0.5,2.0,0.000000,0.0,4.0,3.000000,2.000000,1.500000


In [19]:
col = df.columns[3:]
data = df[col]

In [20]:
scaler = MinMaxScaler()
scaler.fit(data)
df_scaled = scaler.transform(data)
df_scaled

array([[0.16040956, 0.66666667, 0.03557814, ..., 0.        , 0.04166667,
        0.16666667],
       [0.25255973, 0.66666667, 0.22871665, ..., 0.09230769, 0.20833333,
        0.08333333],
       [0.15017065, 0.22222222, 0.01778907, ..., 0.        , 0.04166667,
        0.        ],
       ...,
       [0.3447099 , 0.22222222, 0.3888183 , ..., 0.27692308, 0.41666667,
        0.66666667],
       [0.05802048, 0.22222222, 0.05717916, ..., 0.09230769, 0.08333333,
        0.125     ],
       [0.221843  , 0.22222222, 0.05717916, ..., 0.09230769, 0.08333333,
        0.125     ]])

In [21]:
df[col] = df_scaled
data = df[col]

## 2. K-means 군집화

### 2.1 1차 k-means clustering

#### 2.1.1 Elbow 관측

In [None]:
# 차원 축소 및 변환
pca = PCA(n_components=2)
pca_transformed = pca.fit_transform(data)

In [None]:
inertia = [] # 군집의 centre 와 sample 사이의 거리의 제곱의 합
plt.figure(figsize=(10,6))
for i in range(1, 10): # 군집 개수에 따른 
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, random_state=0)
    kmeans.fit(pca_transformed)
    inertia.append(kmeans.inertia_)
plt.plot(range(1, 10), inertia, marker='o')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

#### 2.1.2 k-means, n_cluster = 3

In [None]:
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=0)
kmeans.fit(pca_transformed)

In [None]:
kmeans.labels_
df = tmp_df.copy()
df['km_cluster'] = kmeans.labels_
df.groupby('km_cluster').count()

In [None]:
df['pca_x'] = pca_transformed[:,0]
df['pca_y'] = pca_transformed[:,1]

# 클러스터별 인덱스 추출
marker_n = len(df['km_cluster'].unique())
markers = []
for i in range(marker_n):
    marker = df[df['km_cluster'] == i].index
    markers.append(marker)

m = ['o','s','^','v','>','<']
for i in range(marker_n):
    plt.scatter(x = df.loc[markers[i], 'pca_x'], y = df.loc[markers[i], 'pca_y'], marker = m[i])

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('2 Clusters Visualization by 2 PCA Components')
plt.legend(['cluster0', 'cluster1','cluster2','cluster3','cluster4','cluster5'])
plt.show()

In [None]:
# visualizer 생성
visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick')
# 생성 된 visualizer에 데이터 입력 
visualizer.fit(pca_transformed)      
visualizer.show()
# sil_avg = silhouette_score(data,kmeans.fit(pca_transformed))
# print(sil_avg)

In [None]:
score = silhouette_score(data,df['km_cluster'])
print('Silhouette Score : {0: .3f}'.format(score))

#### 2.1.3 각 군집 별 데이터 확인

**km_cluster == 0**

In [None]:
df_cluster_0 = df[df['km_cluster'] == 0][col]

In [None]:
df_cluster_0.describe()

**km_cluster == 1**

In [None]:
df_cluster_1 = df[df['km_cluster'] == 1][col]

In [None]:
df_cluster_1.describe()

**km_cluster == 2**

In [None]:
df_cluster_2 = df[df['km_cluster'] == 2][col]

In [None]:
df_cluster_2.describe()

### 2.2 2차 k-means_clustering

#### 2.2.1 km_cluster == 0

- Elbow 관측

In [None]:
df_0 = df[df['km_cluster'] == 0]
df_0 = df_0.drop('km_cluster',axis=1)
df_0

In [None]:
col_0 = df_0.columns[3:-2]
data_0 = df_0[col_0].values

In [None]:
pca_transformed_0 = pca.fit_transform(data_0)
df_0['pca_x'] = pca_transformed_0[:,0]
df_0['pca_y'] = pca_transformed_0[:,1]
df_0

In [None]:
inertia = [] # 군집의 centre 와 sample 사이의 거리의 제곱의 합
plt.figure(figsize=(10,6))
for i in range(1, 10): # 군집 개수에 따른 
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, random_state=0)
    kmeans.fit(pca_transformed_0)
    inertia.append(kmeans.inertia_)
plt.plot(range(1, 10), inertia, marker='o')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

- 2차 k-means : n_cluster = 3(Elbow)로 설정

In [None]:
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=0)
# label
label_0 = kmeans.fit(pca_transformed_0)
# Getting unique labels
u_labels = np.unique(label_0)

In [None]:
df_0['km_cluster'] = kmeans.labels_
df_0.groupby('km_cluster').count()

In [None]:
df_0['pca_x'] = pca_transformed_0[:,0]
df_0['pca_y'] = pca_transformed_0[:,1]

# 클러스터별 인덱스 추출
marker_n = len(df_0['km_cluster'].unique())
markers = []
for i in range(marker_n):
    marker = df_0[df_0['km_cluster'] == i].index
    markers.append(marker)

m = ['o','s','^','v','>']
for i in range(marker_n):
    plt.scatter(x = df_0.loc[markers[i], 'pca_x'], y = df_0.loc[markers[i], 'pca_y'], marker = m[i])

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('Clusters Visualization by 2 PCA Components')
plt.legend(['cluster0', 'cluster1','cluster2'])
plt.show()

In [None]:
visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick')
visualizer.fit(pca_transformed_0)      
visualizer.show()

In [None]:
score = silhouette_score(data_0,df_0['km_cluster'])
print('Silhouette Score : {0: .3f}'.format(score))

In [None]:
df_1 = df[df['km_cluster'] == 1]
df_1 = df_1.drop('km_cluster',axis=1)
df_1

In [None]:
col_1 = df_1.columns[3:-2]
data_1 = df_1[col_1].values

In [None]:
pca_transformed_1 = pca.fit_transform(data_1)
df_1['pca_x'] = pca_transformed_1[:,0]
df_1['pca_y'] = pca_transformed_1[:,1]
df_1

In [None]:
inertia = [] # 군집의 centre 와 sample 사이의 거리의 제곱의 합
plt.figure(figsize=(10,6))
for i in range(1, 10): # 군집 개수에 따른 
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, random_state=0)
    kmeans.fit(pca_transformed_1)
    inertia.append(kmeans.inertia_)
plt.plot(range(1, 10), inertia, marker='o')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=0)
# label
label_1 = kmeans.fit(pca_transformed_1)
# Getting unique labels
u_labels = np.unique(label_1)

In [None]:
df_1['km_cluster'] = kmeans.labels_
df_1.groupby('km_cluster').count()

In [None]:
df_1['pca_x'] = pca_transformed_1[:,0]
df_1['pca_y'] = pca_transformed_1[:,1]

# 클러스터별 인덱스 추출
marker_n = len(df_1['km_cluster'].unique())
markers = []
for i in range(marker_n):
    marker = df_1[df_1['km_cluster'] == i].index
    markers.append(marker)

m = ['o','s','^','v','>']
for i in range(marker_n):
    plt.scatter(x = df_1.loc[markers[i], 'pca_x'], y = df_1.loc[markers[i], 'pca_y'], marker = m[i])

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('Clusters Visualization by 2 PCA Components')
plt.legend(['cluster0', 'cluster1','cluster2'])
plt.show()

In [None]:
# visualizer 생성
visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick')
# 생성 된 visualizer에 데이터 입력 
visualizer.fit(pca_transformed_1)      
visualizer.show()

In [None]:
score = silhouette_score(data_1,df_1['km_cluster'])
print('Silhouette Score : {0: .3f}'.format(score))

In [None]:
df_2 = df[df['km_cluster'] == 2]
df_2 = df_2.drop('km_cluster',axis=1)
df_2

In [None]:
col_2 = df_2.columns[3:-2]
data_2 = df_2[col_2].values

In [None]:
pca_transformed_2 = pca.fit_transform(data_2)
df_2['pca_x'] = pca_transformed_2[:,0]
df_2['pca_y'] = pca_transformed_2[:,1]
df_2

In [None]:
inertia = [] # 군집의 centre 와 sample 사이의 거리의 제곱의 합
plt.figure(figsize=(10,6))
for i in range(1, 10): # 군집 개수에 따른 
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, random_state=0)
    kmeans.fit(pca_transformed_2)
    inertia.append(kmeans.inertia_)
plt.plot(range(1, 10), inertia, marker='o')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=0)
# label
label_2 = kmeans.fit(pca_transformed_2)
# Getting unique labels
u_labels = np.unique(label_2)

In [None]:
df_2['km_cluster'] = kmeans.labels_
df_2.groupby('km_cluster').count()

In [None]:
df_2['pca_x'] = pca_transformed_2[:,0]
df_2['pca_y'] = pca_transformed_2[:,1]

# 클러스터별 인덱스 추출
marker_n = len(df_2['km_cluster'].unique())
markers = []
for i in range(marker_n):
    marker = df_2[df_2['km_cluster'] == i].index
    markers.append(marker)

m = ['o','s','^','v','>']
for i in range(marker_n):
    plt.scatter(x = df_2.loc[markers[i], 'pca_x'], y = df_2.loc[markers[i], 'pca_y'], marker = m[i])

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('Clusters Visualization by 2 PCA Components')
plt.legend(['cluster0', 'cluster1','cluster2'])
plt.show()

In [None]:
# visualizer 생성
visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick')
# 생성 된 visualizer에 데이터 입력 
visualizer.fit(pca_transformed_2)      
visualizer.show()

In [None]:
score = silhouette_score(data_2,df_2['km_cluster'])
print('Silhouette Score : {0: .3f}'.format(score))