# k-means를 이용한 클러스터링

In [1]:
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
import chart_studio.plotly as py
import cufflinks as cf

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
%matplotlib inline

plt.style.use('ggplot')
import folium
import googlemaps
import warnings
warnings.filterwarnings('ignore')

print(cf.__version__)

%config InlineBackend.figure_format = 'retina'
cf.go_offline()

0.17.3


In [2]:
pd.set_option('display.max_rows', 50)

In [3]:
plt.rc('font', size=14)
plt.rc('font', family='NanumGothic')

In [4]:
data_rent = pd.read_csv('C:/Users/psuny/Desktop/대학원/데이터/따릉이 데이터/대여소별 월별 이용정보.csv')
data_station = pd.read_csv('C:/Users/psuny/Desktop/대학원/데이터/따릉이 데이터/대여소 정보(21.06월 기준).csv', engine = 'python')

In [5]:
data_rent.dropna(inplace = True)

In [6]:
data_rent.reset_index(drop = True, inplace = True)
data_rent.tail()

Unnamed: 0,대여일자,대여소명,대여건수
72087,202106,1667. 중계중학교,2047
72088,202106,1668. 중계역 6번출구,2586
72089,202106,1669. 중계역 3번출구,2561
72090,202106,1662. 노원역7번출구,2726
72091,202106,4819. 면목동 새싹어린이공원 앞,661


In [7]:
import re

p = re.compile('\d+[. ]')
for row in data_rent.index:
    try:
        data_rent.loc[row, '대여소명'] = p.sub('', data_rent.loc[row, '대여소명'])
    except:
        print(row)

In [8]:
data_rent['대여일자'] = pd.to_datetime(data_rent['대여일자'], format='%Y%m')

In [9]:
data_station['설치시기'] = pd.to_datetime(data_station['설치시기'])

In [10]:
# 데이터 코로나 전후 나누기
## 서울 코로나 발생일이 2020년 1월 23일이므로
## 2018년 1월부터 2020년 1월까지는 beforecovid로
## 2020년 2월부터 2021년 6월까지는 aftercovid로 나누기

def split_before_and_after(df, date):
    before_covid = df[df['대여일자'] < date]
    after_covid = df[df['대여일자'] >= date]
    return before_covid, after_covid

In [11]:
before_covid, after_covid = split_before_and_after(data_rent, '2020-02-01')

In [12]:
before_covid.head()

Unnamed: 0,대여일자,대여소명,대여건수
0,2018-01-01,서교동 사거리,355
1,2018-01-01,한신코아 앞,44
2,2018-01-01,한양수자인아파트 앞,48
3,2018-01-01,홈플러스 금천점 앞,78
4,2018-01-01,빅마켓 금천점 앞,92


In [13]:
# 대여일자를 기준으로 합치기
before_covid.groupby('대여일자').sum().merge(before_covid[['대여일자']], on='대여일자')

Unnamed: 0,대여일자,대여건수
0,2018-01-01,160261
1,2018-01-01,160261
2,2018-01-01,160261
3,2018-01-01,160261
4,2018-01-01,160261
...,...,...
36578,2020-01-01,794147
36579,2020-01-01,794147
36580,2020-01-01,794147
36581,2020-01-01,794147


In [14]:
before_df_group = pd.merge(before_covid.groupby('대여일자').sum(), before_covid[[
                    '대여일자']].drop_duplicates(), on='대여일자', how='left')
before_df_group = before_df_group[['대여일자', '대여건수']]

before_df_group.head()

Unnamed: 0,대여일자,대여건수
0,2018-01-01,160261
1,2018-02-01,164508
2,2018-03-01,436971
3,2018-04-01,633304
4,2018-05-01,924086


In [15]:
df = before_df_group.set_index('대여일자')
df

Unnamed: 0_level_0,대여건수
대여일자,Unnamed: 1_level_1
2018-01-01,160261
2018-02-01,164508
2018-03-01,436971
2018-04-01,633304
2018-05-01,924086
2018-06-01,2414176
2018-07-01,1099953
2018-08-01,991348
2018-09-01,1392038
2018-10-01,1322104


In [16]:
df.iplot(kind = 'scatter', xTitle = '대여일자', yTitle = '대여건수', title = '코로나 전 대여건수')

In [17]:
df['date'] = df.index.date

In [18]:
df.groupby(['date'])[['대여건수']].sum().iplot()

In [19]:
before_covid = before_covid.set_index('대여일자')
before_covid

Unnamed: 0_level_0,대여소명,대여건수
대여일자,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,서교동 사거리,355
2018-01-01,한신코아 앞,44
2018-01-01,한양수자인아파트 앞,48
2018-01-01,홈플러스 금천점 앞,78
2018-01-01,빅마켓 금천점 앞,92
...,...,...
2020-01-01,하계동 중평어린이공원 앞,763
2020-01-01,노일초등학교 앞,312
2020-01-01,수연빌딩 앞,470
2020-01-01,당고개공원 대여소,565


In [20]:
before_covid['year'] = before_covid.index.year

In [21]:
before_covid['month'] = before_covid.index.month

In [22]:
before_covid.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 36583 entries, 2018-01-01 to 2020-01-01
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   대여소명    36583 non-null  object
 1   대여건수    36583 non-null  int64 
 2   year    36583 non-null  int64 
 3   month   36583 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.4+ MB


In [23]:
before_covid.reset_index(inplace = True)

In [24]:
del before_covid['대여일자']

In [25]:
before_covid

Unnamed: 0,대여소명,대여건수,year,month
0,서교동 사거리,355,2018,1
1,한신코아 앞,44,2018,1
2,한양수자인아파트 앞,48,2018,1
3,홈플러스 금천점 앞,78,2018,1
4,빅마켓 금천점 앞,92,2018,1
...,...,...,...,...
36578,하계동 중평어린이공원 앞,763,2020,1
36579,노일초등학교 앞,312,2020,1
36580,수연빌딩 앞,470,2020,1
36581,당고개공원 대여소,565,2020,1


In [26]:
before_covid = pd.get_dummies(before_covid, columns = ['대여소명'], drop_first = True)

In [27]:
before_covid

Unnamed: 0,대여건수,year,month,대여소명_ 둔촌역 3번 출입구,대여소명_ (구)신한은행 중랑교지점,대여소명_ (구)합정동 주민센터,대여소명_ 19민주묘지역,대여소명_ 3호선 매봉역 3번출구앞,대여소명_ 9호선종합운동장역 9번출구,대여소명_ CJ 드림시티,...,대여소명_양재전화국 사거리,대여소명_양재초등학교 맞은편,대여소명_위트콤,대여소명_이동정비,대여소명_이수역 4번 출구,대여소명_자양중앙나들목,대여소명_중랑센터,대여소명_중랑센터2,대여소명_청계산입구역 1번출구,대여소명_하나은행 방배동지점 앞
0,355,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,44,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,48,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,78,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,92,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36578,763,2020,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36579,312,2020,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36580,470,2020,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36581,565,2020,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# elbow method를 사용해 최적의 k찾기

distortions = [] 
k_range = range(1,10)

for i in k_range:
  km = KMeans(n_clusters = i, random_state=42)
  km.fit(before_covid)
  distortions.append(km.inertia_)

print(distortions)

[27838498498.056583, 11995565537.167332, 6704768944.129062, 4238298566.697255, 2906309378.524639, 2067201187.0859225, 1542206651.065568, 1200968163.6382632, 960714392.4798298]


In [33]:
# elbow method를 그래프로 확인해보세요.
# x축이 k의 수, y축이 distortions인 line plot을 그려봅시다.
import plotly.express as px

fig = px.line(x=k_range, y=distortions, labels={"x":"k", "y":"distortions"})
fig.update_layout(width=800, height=500)
fig.show()

In [34]:
km = KMeans(n_clusters = 4)
km.fit(before_covid)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [35]:
# 각 군집의 변수별 중심값을 비교해보세요.
# [참고] 변수별 중심값은 km.cluster_centers_ 를 활용하세요.

pd.DataFrame(km.cluster_centers_, columns=before_covid.columns)

Unnamed: 0,대여건수,year,month,대여소명_ 둔촌역 3번 출입구,대여소명_ (구)신한은행 중랑교지점,대여소명_ (구)합정동 주민센터,대여소명_ 19민주묘지역,대여소명_ 3호선 매봉역 3번출구앞,대여소명_ 9호선종합운동장역 9번출구,대여소명_ CJ 드림시티,...,대여소명_양재전화국 사거리,대여소명_양재초등학교 맞은편,대여소명_위트콤,대여소명_이동정비,대여소명_이수역 4번 출구,대여소명_자양중앙나들목,대여소명_중랑센터,대여소명_중랑센터2,대여소명_청계산입구역 1번출구,대여소명_하나은행 방배동지점 앞
0,372.535062,2018.545241,5.847616,4.350096e-05,0.0008265182,0.0007395163,0.0004785105,0.0005655124,0.0003045067,8.7e-05,...,0.001131025,0.001044023,4.350096e-05,8.700191e-05,0.0003045067,0.0003480077,0.0006090134,0.0002175048,0.0003480077,0.0006525144
1,2646.165083,2018.764872,7.594861,0.001759944,-3.794708e-17,-3.794708e-17,-3.426079e-17,-3.794708e-17,0.001759944,0.00352,...,-3.794708e-17,-3.794708e-17,1.629691e-18,3.259383e-18,0.002111932,0.002111932,-9.269929e-18,-8.565197e-18,-3.794708e-17,-3.794708e-17
2,6234.067669,2018.81203,7.635338,-7.589414999999999e-19,-4.119968e-18,-4.119968e-18,1.301043e-18,-4.119968e-18,-4.119968e-18,0.022556,...,-4.119968e-18,-4.119968e-18,1.3552529999999999e-19,2.710505e-19,-4.119968e-18,-4.119968e-18,-1.6263029999999999e-19,3.2526069999999995e-19,-4.119968e-18,-4.119968e-18
3,1226.860317,2018.632914,7.318555,0.0002860412,0.0006674294,0.0008581236,0.0008581236,0.001239512,0.001334859,0.000763,...,-7.556889e-17,0.0001906941,-9.520649999999999e-19,-1.90413e-18,0.001239512,0.001144165,-1.1275700000000001e-17,-9.215718e-18,0.001716247,0.001048818


In [36]:
# 훈련된 모델의 label을 저장해주세요.

before_covid["label_from_km"] = km.labels_
before_covid.groupby(["label_from_km"]).count()

Unnamed: 0_level_0,대여건수,year,month,대여소명_ 둔촌역 3번 출입구,대여소명_ (구)신한은행 중랑교지점,대여소명_ (구)합정동 주민센터,대여소명_ 19민주묘지역,대여소명_ 3호선 매봉역 3번출구앞,대여소명_ 9호선종합운동장역 9번출구,대여소명_ CJ 드림시티,...,대여소명_양재전화국 사거리,대여소명_양재초등학교 맞은편,대여소명_위트콤,대여소명_이동정비,대여소명_이수역 4번 출구,대여소명_자양중앙나들목,대여소명_중랑센터,대여소명_중랑센터2,대여소명_청계산입구역 1번출구,대여소명_하나은행 방배동지점 앞
label_from_km,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,22988,22988,22988,22988,22988,22988,22988,22988,22988,22988,...,22988,22988,22988,22988,22988,22988,22988,22988,22988,22988
1,2841,2841,2841,2841,2841,2841,2841,2841,2841,2841,...,2841,2841,2841,2841,2841,2841,2841,2841,2841,2841
2,266,266,266,266,266,266,266,266,266,266,...,266,266,266,266,266,266,266,266,266,266
3,10488,10488,10488,10488,10488,10488,10488,10488,10488,10488,...,10488,10488,10488,10488,10488,10488,10488,10488,10488,10488


In [39]:
before_covid

Unnamed: 0,대여건수,year,month,대여소명_ 둔촌역 3번 출입구,대여소명_ (구)신한은행 중랑교지점,대여소명_ (구)합정동 주민센터,대여소명_ 19민주묘지역,대여소명_ 3호선 매봉역 3번출구앞,대여소명_ 9호선종합운동장역 9번출구,대여소명_ CJ 드림시티,...,대여소명_양재초등학교 맞은편,대여소명_위트콤,대여소명_이동정비,대여소명_이수역 4번 출구,대여소명_자양중앙나들목,대여소명_중랑센터,대여소명_중랑센터2,대여소명_청계산입구역 1번출구,대여소명_하나은행 방배동지점 앞,label_from_km
0,355,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,44,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,48,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,78,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,92,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36578,763,2020,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36579,312,2020,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36580,470,2020,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36581,565,2020,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# 훈련결과를 시각화로 확인해보세요.
# [참고] 중심값 차이가 큰 변수를 사용하면 차이를 더 확실히 확인할 수 있습니다.
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=1, cols=2, subplot_titles=("Actual-test","K-means cluster"))

fig.add_trace(
    go.Scatter(x=before_covid["year"], 
               y=before_covid["대여건수"], 
               mode="markers",
               ),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=before_covid['year'], 
               y=before_covid["대여건수"], 
               mode="markers",
               marker=dict(color=before_covid["label_from_km"]),
               ),
    row=1, col=2
)

fig.update_layout(height=600, width=800)
fig.show()

In [28]:
after_covid.head()

Unnamed: 0,대여일자,대여소명,대여건수
36583,2020-02-01,거여역 3번출구,525
36584,2020-02-01,방이삼거리,1193
36585,2020-02-01,롯데월드타워(잠실역2번출구 쪽),2574
36586,2020-02-01,잠실나루역 (2번 출구 쪽),1617
36587,2020-02-01,풍납현대아파트쉼터,914


In [47]:
# 대여일자를 기준으로 합치기
after_covid.groupby('대여일자').sum().merge(after_covid[['대여일자']], on='대여일자')

after_df_group = pd.merge(after_covid.groupby('대여일자').sum(), after_covid[[
                    '대여일자']].drop_duplicates(), on='대여일자', how='left')
after_df_group = after_df_group[['대여일자', '대여건수']]

df2 = after_df_group.set_index('대여일자')
df2

df2.iplot(kind = 'scatter', xTitle = '대여일자', yTitle = '대여건수', title = '코로나 후 대여건수')

In [29]:
data_station.head()

Unnamed: 0,대여소ID,대여소명,대여소_구,대여소주소,위도,경도,설치시기,거치대수
0,102,망원역 1번출구 앞,마포구,서울특별시 마포구 월드컵로 72,37.555649,126.910629,2015-09-06,20
1,103,망원역 2번출구 앞,마포구,서울특별시 마포구 월드컵로 79,37.554951,126.910835,2015-09-06,14
2,104,합정역 1번출구 앞,마포구,서울특별시 마포구 양화로 59,37.550629,126.914986,2015-09-06,13
3,105,합정역 5번출구 앞,마포구,서울특별시 마포구 양화로 48,37.550007,126.914825,2015-09-06,5
4,106,합정역 7번출구 앞,마포구,서울특별시 마포구 독막로 4,37.548645,126.912826,2015-09-06,10
