In [1]:
import pandas as pd
import json
import folium
import seaborn as sns
from sklearn.cluster import KMeans

In [2]:
# 클러스터링한 군집이 표시되어 있는 파일 불러오기
kmean = pd.read_csv('output/k-mean.csv')
kmean = kmean.iloc[:, range(1,5)]
display(kmean.head())

Unnamed: 0,val,위도,경도,0
0,5379.0,37.509659,126.767363,28
1,4169.0,37.494231,126.750545,13
2,3789.0,37.510539,126.763959,27
3,3438.0,37.509624,126.761706,27
4,3211.0,37.502449,126.767433,2


In [3]:
# 알기쉽게 컬럼명 변경
kmean.columns = ['인구밀도', '위도', '경도', '군집']

In [4]:
# 군집별로 위도와 경도의 평균 계산 (군집의 중앙 찾기 위해)
kmean_cluster = kmean.groupby('군집').mean().loc[:, ['위도', '경도']]
kmean_cluster = kmean_cluster.reset_index()
kmean_cluster.columns = ['군집', '위도_평균', '경도_평균']

In [5]:
# 군집별로 위도와 경도의 분산 계산 (너무 멀리 퍼져있는 군집은 다시 분할할지 판단 위해)
kmean_cluster2 = kmean.groupby('군집').std().loc[:, ['위도', '경도']]
kmean_cluster2 = kmean_cluster2.reset_index()
kmean_cluster2.columns = ['군집', '위도_분산', '경도_분산']

In [6]:
# 군집별 개수 계산 (군집 내 수가 많은 경우 다시 분할할지 판단)
kmean_cluster3 = kmean.groupby('군집').count()['인구밀도']
kmean_cluster3 = kmean_cluster3.reset_index()
kmean_cluster3.columns = ['군집', '군집수']

In [7]:
display(kmean_cluster.head())
display(kmean_cluster2.head())
display(kmean_cluster3.head())

Unnamed: 0,군집,위도_평균,경도_평균
0,0,37.497041,126.767486
1,1,37.507009,126.776117
2,2,37.50127,126.771216
3,3,37.50144,126.750386
4,4,37.514374,126.770354


Unnamed: 0,군집,위도_분산,경도_분산
0,0,0.001213,0.00182
1,1,0.001807,0.002324
2,2,0.001692,0.001745
3,3,0.001984,0.001626
4,4,0.00216,0.002821


Unnamed: 0,군집,군집수
0,0,11
1,1,7
2,2,15
3,3,13
4,4,19


In [8]:
# 군집별 위도·경도의 평균 분산 군집수를 하나의 df로 합치기
kmean_cluster = pd.merge(left = kmean_cluster, right = kmean_cluster2, how = "inner", on = "군집")
kmean_cluster = pd.merge(left = kmean_cluster, right = kmean_cluster3, how = "inner", on = "군집")
kmean_cluster.head()

Unnamed: 0,군집,위도_평균,경도_평균,위도_분산,경도_분산,군집수
0,0,37.497041,126.767486,0.001213,0.00182,11
1,1,37.507009,126.776117,0.001807,0.002324,7
2,2,37.50127,126.771216,0.001692,0.001745,15
3,3,37.50144,126.750386,0.001984,0.001626,13
4,4,37.514374,126.770354,0.00216,0.002821,19


In [9]:
# 위도·경도의 평균 분산 군집수의 데이터 확인
kmean_cluster.describe()

Unnamed: 0,군집,위도_평균,경도_평균,위도_분산,경도_분산,군집수
count,50.0,50.0,50.0,50.0,50.0,50.0
mean,24.5,37.495316,126.784072,0.001901,0.001916,12.78
std,14.57738,0.017148,0.020348,0.000633,0.000551,5.159853
min,0.0,37.462601,126.749429,0.00063,0.000838,5.0
25%,12.25,37.483237,126.768248,0.001436,0.001597,9.25
50%,24.5,37.494769,126.784217,0.001849,0.001937,11.5
75%,36.75,37.506944,126.801162,0.002255,0.00224,15.0
max,49.0,37.528551,126.822331,0.003443,0.003172,28.0


In [10]:
# 위도분산 또는 경도분산이 75% 이상인 경우
var_list = list(kmean_cluster[(kmean_cluster['위도_분산'] >= 0.002255) |
                              (kmean_cluster['경도_분산'] >= 0.002240)]['군집'])

In [11]:
# 클러스터의 군집수가 15개 이상인 경우
num_list = list(kmean_cluster[(kmean_cluster['위도_분산'] < 0.002255) &
                              (kmean_cluster['경도_분산'] < 0.002240) &
                              (kmean_cluster['군집수'] >= 15)]['군집'])

In [12]:
# 위 모두 속하지 않는 경우
other_list = list(set(range(50)) - (set(var_list) | set(num_list)))

In [13]:
kmean_test1 = kmean[kmean['군집'].apply(lambda x: x in var_list)]
kmean_test2 = kmean[kmean['군집'].apply(lambda x: x in num_list)]
kmean_test3 = kmean[kmean['군집'].apply(lambda x: (x not in num_list) & (x not in var_list))]

In [14]:
print(len(var_list))
print(len(num_list))
print(len(other_list))

22
9
19


In [15]:
# 위도분산 또는 경도분산이 75% 이상인 경우의 시각화

color = sns.color_palette('hls', 22)
colors = color.as_hex()

# 부천 지도 만들기
geo_data = "data/HangJeongDong_bucheon2.json"

with open(geo_data,encoding="utf-8") as f:
    geo_data = json.loads(f.read())

bucheon_map = folium.Map(location=[37.50554861215234,126.77550612183495], zoom_start=13)

for i in range(22):
    j = i + 1
    for name, lat, lng in zip(kmean_test1[kmean_test1['군집'] == var_list[i]].군집,
                              kmean_test1[kmean_test1['군집'] == var_list[i]].위도,
                              kmean_test1[kmean_test1['군집'] == var_list[i]].경도):
        folium.CircleMarker([lat, lng],
                            radius=4.5,         # 원의 반지름
                            fill=True,
                            color = colors[21-i],   # 테두리 색
                            fill_color= colors[i],    # 원을 채우는 색
                            fill_opacity=0.7, # 투명도    
                            popup=name
        ).add_to(bucheon_map)
    
fmap=folium.Choropleth(geo_data = geo_data,
                       color = "black",
                       nan_fill_color='black',
                       fill_opacity=0.1,
                       fill_color = None).add_to(bucheon_map)

display(bucheon_map)

In [16]:
# 클러스터의 군집수가 15개(상위 25%) 이상인 경우의 시각화
color = sns.color_palette('hls', 9)
colors = color.as_hex()

# 부천 지도 만들기
geo_data = "data/HangJeongDong_bucheon2.json"

with open(geo_data,encoding="utf-8") as f:
    geo_data = json.loads(f.read())

bucheon_map = folium.Map(location=[37.50554861215234,126.77550612183495], zoom_start=13)

for i in range(9):
    j = i + 1
    for name, lat, lng in zip(kmean_test2[kmean_test2['군집'] == num_list[i]].군집,
                              kmean_test2[kmean_test2['군집'] == num_list[i]].위도,
                              kmean_test2[kmean_test2['군집'] == num_list[i]].경도):
        folium.CircleMarker([lat, lng],
                            radius=4.5,         # 원의 반지름
                            fill=True,
                            color = colors[8-i],   # 테두리 색
                            fill_color= colors[i],    # 원을 채우는 색
                            fill_opacity=0.7, # 투명도    
                            popup=name
        ).add_to(bucheon_map)
    
fmap=folium.Choropleth(geo_data = geo_data,
                       color = "black",
                       nan_fill_color='black',
                       fill_opacity=0.1,
                       fill_color = None).add_to(bucheon_map)

display(bucheon_map)

In [17]:
# 위에 2가지 경우에 속하지 않는 경우의 중심좌표 df
kmean_center1 = kmean_cluster[kmean_cluster['군집'].apply(lambda x: (x not in num_list) & (x not in var_list))]
kmean_center1.head()

Unnamed: 0,군집,위도_평균,경도_평균,위도_분산,경도_분산,군집수
0,0,37.497041,126.767486,0.001213,0.00182,11
3,3,37.50144,126.750386,0.001984,0.001626,13
5,5,37.496461,126.787226,0.001782,0.001979,9
6,6,37.490873,126.790673,0.001275,0.001798,9
10,10,37.480927,126.798087,0.001979,0.002178,12


In [18]:
# 양쪽 모두 속하지 않는 그룹의 시각화 (검은색은 중심좌표)

color = sns.color_palette('hls', 19)
colors = color.as_hex()

# 부천 지도 만들기
geo_data = "data/HangJeongDong_bucheon2.json"

with open(geo_data,encoding="utf-8") as f:
    geo_data = json.loads(f.read())

bucheon_map = folium.Map(location=[37.50554861215234,126.77550612183495], zoom_start=13)

for i in range(19):
    j = i + 1
    for name, lat, lng in zip(kmean_test3[kmean_test3['군집'] == other_list[i]].군집,
                              kmean_test3[kmean_test3['군집'] == other_list[i]].위도,
                              kmean_test3[kmean_test3['군집'] == other_list[i]].경도):
        folium.CircleMarker([lat, lng],
                            radius=4.5,         # 원의 반지름
                            fill=True,
                            color = colors[18-i],   # 테두리 색
                            fill_color= colors[i],    # 원을 채우는 색
                            fill_opacity=0.7, # 투명도    
                            popup=name
        ).add_to(bucheon_map)

for i in range(19):
    j = i + 1
    for name, lat, lng in zip(kmean_center1[kmean_center1['군집'] == other_list[i]].군집,
                              kmean_center1[kmean_center1['군집'] == other_list[i]].위도_평균,
                              kmean_center1[kmean_center1['군집'] == other_list[i]].경도_평균):
        folium.CircleMarker([lat, lng],
                            radius=5.5,         # 원의 반지름
                            fill=True,
                            color = "black",   # 테두리 색
                            fill_color= "black",    # 원을 채우는 색
                            fill_opacity=0.7, # 투명도    
                            popup=name
        ).add_to(bucheon_map)

    
fmap=folium.Choropleth(geo_data = geo_data,
                       color = "black",
                       nan_fill_color='black',
                       fill_opacity=0.1,
                       fill_color = None).add_to(bucheon_map)

display(bucheon_map)

In [19]:
kmean_test1.head(5)

Unnamed: 0,인구밀도,위도,경도,군집
2,3789.0,37.510539,126.763959,27
3,3438.0,37.509624,126.761706,27
7,2751.0,37.512271,126.752627,11
9,2466.0,37.51501,126.758258,9
10,2335.0,37.508687,126.756057,9


In [20]:
# 각 군집별로 다시 2개의 군집으로 나누는 함수

def clustering(df, mylist):
    sr = pd.Series(name = '군집2')
    for i in range(len(mylist)):
        temp = df[df['군집'] == mylist[i]]
        kmeans = KMeans(init = 'random', n_clusters = 2)
        kmeans.fit(temp[['위도', '경도']])
        y_pred = kmeans.predict(temp[['위도', '경도']])
        temp.loc[:, '군집2'] = y_pred
        temp = temp.loc[:, '군집2']
        sr = pd.concat([sr, temp])
    df = df.join(sr, how='outer')
    return df

In [21]:
# 위도 경도의 분산 상위 25%인 경우를 다시 2개의 군집으로 클러스터링
kmean_test1 = clustering(kmean_test1, var_list)

  sr = pd.Series(name = '군집2')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.


In [22]:
# 클러스터에 속한 후보 개수가 상위 25%인 경우를 다시 2개의 군집으로 클러스터링
kmean_test2 = clustering(kmean_test2, num_list)

  sr = pd.Series(name = '군집2')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.


In [23]:
kmean_test1['군집'] = kmean_test1['군집'].apply(lambda x: str(x)) + '_' + kmean_test1['군집2'].apply(lambda x: str(x))
kmean_test1.head()

Unnamed: 0,인구밀도,위도,경도,군집,군집2
2,3789.0,37.510539,126.763959,27_1,1
3,3438.0,37.509624,126.761706,27_1,1
7,2751.0,37.512271,126.752627,11_0,0
9,2466.0,37.51501,126.758258,9_0,0
10,2335.0,37.508687,126.756057,9_1,1


In [24]:
kmean_test2['군집'] = kmean_test2['군집'].apply(lambda x: str(x)) + '_' + kmean_test2['군집2'].apply(lambda x: str(x))
kmean_test2.head()

Unnamed: 0,인구밀도,위도,경도,군집,군집2
0,5379.0,37.509659,126.767363,28_1,1
1,4169.0,37.494231,126.750545,13_1,1
4,3211.0,37.502449,126.767433,2_0,0
6,2795.0,37.493436,126.767521,20_0,0
8,2701.0,37.499773,126.771984,2_1,1


In [25]:
# 모든 후보군의 위도 경도 군집 df
kmean_fin = pd.concat([kmean[kmean['군집'].apply(lambda x: (x not in num_list) & (x not in var_list))], kmean_test1, kmean_test2])
kmean_fin = kmean_fin.iloc[:, range(4)]
kmean_fin.head()

Unnamed: 0,인구밀도,위도,경도,군집
5,2910.0,37.499624,126.748228,3
16,1925.0,37.498737,126.7505,3
21,1817.0,37.500554,126.752744,3
26,1471.0,37.503236,126.749323,3
27,1435.0,37.503244,126.750455,3


In [26]:
# 모든 후보군의 군집 중심좌표 df
kmean_fin_cluster = kmean_fin.groupby('군집').mean().loc[:, ['위도', '경도']]
kmean_fin_cluster = kmean_fin_cluster.reset_index()
kmean_fin_cluster.columns = ['군집', '위도_평균', '경도_평균']
kmean_fin_cluster

Unnamed: 0,군집,위도_평균,경도_평균
0,0,37.497041,126.767486
1,3,37.501440,126.750386
2,5,37.496461,126.787226
3,6,37.490873,126.790673
4,10,37.480927,126.798087
...,...,...,...
76,7_1,37.524297,126.810733
77,8_0,37.467883,126.800565
78,8_1,37.469997,126.804220
79,9_0,37.514329,126.757416


In [27]:
# 총 군집이 81개 (분산과 개수로 다시 클러스터링한 경우) 시각화
color1 = sns.color_palette('Set1', 81)
colors1 = color1.as_hex()
color2 = sns.color_palette('Set3', 81)
colors2 = color2.as_hex()

# 부천 지도 만들기
geo_data = "data/HangJeongDong_bucheon2.json"

with open(geo_data,encoding="utf-8") as f:
    geo_data = json.loads(f.read())

bucheon_map = folium.Map(location=[37.50554861215234,126.77550612183495], zoom_start=13)

for i in range(81):
    for name, lat, lng in zip(kmean_fin[kmean_fin['군집'] == kmean_fin_cluster['군집'][i]].군집,
                              kmean_fin[kmean_fin['군집'] == kmean_fin_cluster['군집'][i]].위도,
                              kmean_fin[kmean_fin['군집'] == kmean_fin_cluster['군집'][i]].경도):
        folium.CircleMarker([lat, lng],
                            radius=4.5,         # 원의 반지름
                            fill=True,
                            color = colors1[i],   # 테두리 색
                            fill_color= colors2[i],    # 원을 채우는 색
                            fill_opacity=0.7, # 투명도    
                            popup=name
        ).add_to(bucheon_map)

for i in range(81):
    for name, lat, lng in zip(kmean_fin_cluster[kmean_fin_cluster['군집'] == kmean_fin_cluster['군집'][i]].군집,
                              kmean_fin_cluster[kmean_fin_cluster['군집'] == kmean_fin_cluster['군집'][i]].위도_평균,
                              kmean_fin_cluster[kmean_fin_cluster['군집'] == kmean_fin_cluster['군집'][i]].경도_평균):
        folium.CircleMarker([lat, lng],
                            radius=5.5,         # 원의 반지름
                            fill=True,
                            color = "black",   # 테두리 색
                            fill_color= "black",    # 원을 채우는 색
                            fill_opacity=0.7, # 투명도    
                            popup=name
        ).add_to(bucheon_map)

    
fmap=folium.Choropleth(geo_data = geo_data,
                       color = "black",
                       nan_fill_color='black',
                       fill_opacity=0.1,
                       fill_color = None).add_to(bucheon_map)

display(bucheon_map)

bucheon_map.save('output/cluster80.html')

In [28]:
# 총 군집이 50개 (초기에 k-mean 클러스터링한 것) 시각화
color1 = sns.color_palette('Set1', 50)
colors1 = color1.as_hex()
color2 = sns.color_palette('Set3', 50)
colors2 = color2.as_hex()

# 부천 지도 만들기
geo_data = "data/HangJeongDong_bucheon2.json"

with open(geo_data,encoding="utf-8") as f:
    geo_data = json.loads(f.read())

bucheon_map = folium.Map(location=[37.50554861215234,126.77550612183495], zoom_start=13)

for i in range(50):
    for name, lat, lng in zip(kmean[kmean['군집'] == kmean_cluster['군집'][i]].군집,
                              kmean[kmean['군집'] == kmean_cluster['군집'][i]].위도,
                              kmean[kmean['군집'] == kmean_cluster['군집'][i]].경도):
        folium.CircleMarker([lat, lng],
                            radius=4.5,         # 원의 반지름
                            fill=True,
                            color = colors1[i],   # 테두리 색
                            fill_color= colors2[i],    # 원을 채우는 색
                            fill_opacity=0.7, # 투명도    
                            popup=name
        ).add_to(bucheon_map)

for i in range(50):
    for name, lat, lng in zip(kmean_cluster[kmean_cluster['군집'] == kmean_cluster['군집'][i]].군집,
                              kmean_cluster[kmean_cluster['군집'] == kmean_cluster['군집'][i]].위도_평균,
                              kmean_cluster[kmean_cluster['군집'] == kmean_cluster['군집'][i]].경도_평균):
        folium.CircleMarker([lat, lng],
                            radius=5.5,         # 원의 반지름
                            fill=True,
                            color = "black",   # 테두리 색
                            fill_color= "black",    # 원을 채우는 색
                            fill_opacity=0.7, # 투명도    
                            popup=name
        ).add_to(bucheon_map)

    
fmap=folium.Choropleth(geo_data = geo_data,
                       color = "black",
                       nan_fill_color='black',
                       fill_opacity=0.1,
                       fill_color = None).add_to(bucheon_map)

display(bucheon_map)

bucheon_map.save('output/cluster50.html')

In [33]:
kmean_cluster.to_csv('output/cluster50.csv')

In [34]:
kmean_fin_cluster.to_csv('output/cluster80.csv')