### 1. 각 군집의 centroid로부터의 거리 계산 (주성분 활용)

In [1]:
import pandas as pd
import numpy as np

# Load the k-means centroids that were saved as a NumPy array.
centroids = np.load('./data/kmeans_centroid.npy')

# PCA result with cluster labels; the first CSV column becomes the index.
pca_result = pd.read_csv(
    './data/minmax_scaling_PCA_label.csv',
    index_col=0,
)

# Subway ridership data (includes the '호선', '역명' columns used below).
subway_data = pd.read_csv(
    './data/서울지하철_공공데이터_호선별_역별_승하차인원_일평균_행정동포함.csv'
)

# Build a "<line>_<station> " label per row. The trailing space is kept on
# purpose: it separates labels when they are concatenated per dong later.
line_and_station = subway_data['호선'] + '_' + subway_data['역명']
subway_data['호선+역명'] = line_and_station + " "

In [2]:
# Split the PCA result into one DataFrame per cluster label.
# Each subset is taken with an explicit .copy() so that it is an independent
# frame: columns added to it later do not raise pandas'
# SettingWithCopyWarning and can never write through to pca_result.
pca_cluster_0 = pca_result[pca_result['cluster'] == 0].copy()
pca_cluster_1 = pca_result[pca_result['cluster'] == 1].copy()
pca_cluster_2 = pca_result[pca_result['cluster'] == 2].copy()
pca_cluster_3 = pca_result[pca_result['cluster'] == 3].copy()

In [3]:
# Euclidean distance helper
def calculate_distance(a, b):
    """Return the Euclidean distance between two points.

    Args:
        a: Iterable of coordinates; its length decides how many dimensions
            are compared.
        b: Indexable collection of coordinates with at least as many
            entries as ``a``. Entries beyond the length of ``a`` are
            ignored.

    Returns:
        The square root of the summed squared coordinate differences
        (``0.0`` when ``a`` is empty).

    Raises:
        IndexError: If ``b`` is a list/array with fewer entries than ``a``.
    """
    # Pair each coordinate of `a` with the same position in `b`; indexing
    # `b` (rather than zipping) keeps the error on a too-short `b`.
    squared_sum = sum((coord - b[i]) ** 2 for i, coord in enumerate(a))
    return squared_sum ** 0.5

In [4]:
# Silence pandas' chained-assignment warning (the option accepts
# None / 'warn' / 'raise'; None disables the check).
import pandas as pd
pd.set_option('mode.chained_assignment', None)

In [5]:
# Add a 'distance' column to every cluster frame: the distance of each row
# from the centroid whose position in `centroids` equals the cluster label.
pca_clusters = [pca_cluster_0, pca_cluster_1, pca_cluster_2, pca_cluster_3]

for i, pca_cluster in enumerate(pca_clusters):
    centroid = centroids[i]
    # Everything except the label column is treated as the point coordinates.
    tmp = pca_cluster.drop(columns='cluster')
    distances = [calculate_distance(centroid, point) for point in tmp.to_numpy()]
    # The frames in the list are the same objects as pca_cluster_0..3,
    # so this in-place assignment updates those variables too.
    pca_cluster.loc[:, 'distance'] = distances

클러스터별 최단 거리 순위 뽑아보기

In [6]:
# For every cluster, keep the 5 rows with the smallest 'distance' value and
# stack them into one frame (cluster 0 first, cluster 3 last).
top_5 = pd.concat([
    cluster.sort_values('distance').head(5)
    for cluster in (pca_cluster_0, pca_cluster_1, pca_cluster_2, pca_cluster_3)
])

In [7]:
# Map each selected dong to the subway stations located in it.
# Every '호선+역명' label ends with a space, so summing the strings per dong
# yields one space-separated string of station labels.
area_w_subway = subway_data.groupby('행정동')['호선+역명'].sum().to_frame()

# Align the station strings to top_5's index in a single lookup instead of a
# nested label-by-label loop; dongs without any station come back as NaN.
station_labels = area_w_subway['호선+역명'].reindex(top_5.index)

# Build the cleaned column first and assign it in one step. The previous
# `top_5['subway'].fillna("", inplace=True)` operated on a temporary object
# under pandas Copy-on-Write, leaving NaN behind and breaking the split();
# it also failed with KeyError when no dong matched at all.
# .to_numpy() assigns by position, so repeated index labels cannot cause an
# alignment error.
top_5['subway'] = (
    station_labels.fillna("")
    .map(lambda labels: ','.join(labels.split()))
    .to_numpy()
)
top_5

Unnamed: 0_level_0,0,1,2,3,4,5,cluster,distance,subway
행정동명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
송파2동,0.079815,-0.24463,0.027826,0.015311,-0.039013,-0.010445,0,0.143294,
남가좌1동,0.088593,-0.312289,0.017162,-0.16712,0.047328,0.041055,0,0.15883,경의중앙선_가좌
하계2동,0.164587,-0.306063,-0.004596,-0.099306,-0.003715,-0.035986,0,0.163167,7호선_하계
방이1동,-0.05698,-0.250338,0.119408,-0.080721,-0.087399,0.055004,0,0.168104,5호선_방이
방배3동,0.100324,-0.300655,0.057266,0.06177,0.054492,0.053401,0,0.17373,2호선_방배
면목4동,-0.075829,-0.078424,-0.16365,0.052472,0.042259,0.000171,1,0.089658,7호선_용마산
암사1동,-0.065351,-0.064685,-0.098996,-0.018784,0.066846,0.051345,1,0.093978,8호선_암사
홍제3동,-0.044947,-0.10227,-0.078139,0.104911,0.052648,-0.021009,1,0.110024,3호선_홍제
화곡2동,-0.077616,-0.105319,-0.091817,-0.04653,-0.033173,0.018272,1,0.112856,
왕십리2동,-0.095186,-0.099579,-0.080063,-0.03063,-0.046733,0.044464,1,0.11309,2호선_상왕십리


In [8]:
# Write the selected rows (including the mapped 'subway' column) to CSV;
# the index is written as the first column (pandas default).
top_5.to_csv('./data/optimal_location_w_subway_station.csv')

### 2. stanine을 이용해서 클러스터별 등급 매기기

In [9]:
import pandas as pd
import numpy as np

# Load the three inputs for the stanine grading step. Each file uses its
# first column as the index. Note that `pca_result` is re-read from disk here.
original_df = pd.read_csv(
    './data/final_data_after_winsorization_with_selected_features.csv',
    index_col=0,
)
scaled_df = pd.read_csv(
    './data/minmax_scaling.csv',
    index_col=0,
)
pca_result = pd.read_csv(
    './data/minmax_scaling_PCA_label.csv',
    index_col=0,
)

## 최종 입지 선정

In [13]:
# Display the cluster-3 DataFrame as the cell output.
pca_cluster_3

Unnamed: 0_level_0,0,1,2,3,4,5,cluster,distance
행정동명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
가리봉동,-0.612605,0.107408,-0.156476,0.051506,-0.025351,-0.021620,3,0.468584
가산동,-0.117360,0.795470,0.181084,-0.224243,0.009892,-0.060995,3,0.566771
광희동,-0.557756,0.354156,0.122425,-0.089660,0.216152,0.137628,3,0.420802
구로3동,-0.366267,0.147160,0.216341,0.063184,-0.176238,-0.113697,3,0.309302
구로5동,-0.253796,0.123504,0.062825,-0.058777,-0.028850,-0.014961,3,0.169749
...,...,...,...,...,...,...,...,...
화양동,-0.251832,0.675537,0.019692,-0.315194,-0.301797,-0.067263,3,0.539773
황학동,-0.318490,0.160926,-0.209637,-0.193470,0.016964,0.065220,3,0.357892
회기동,-0.450670,0.145913,0.164511,-0.015268,-0.382633,-0.112595,3,0.441333
회현동,-0.514809,0.151396,0.289890,0.194440,0.439240,-0.154272,3,0.663796
