# 고객 유형 분류
## KMeans Clustering

고객 정보와 수입, 소비 데이터를 활용하여 
KMeans Clustering을 통해 유형을 분류한다. 

## 모듈 및 데이터 로드

In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [117]:
df = pd.read_csv('Mall_Customers.csv', index_col = 0)

In [118]:
df.head()

Unnamed: 0_level_0,Gender,Age,Annual Income (k$),Spending Score (1-100)
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Male,19,15,39
2,Male,21,15,81
3,Female,20,16,6
4,Female,23,16,77
5,Female,31,17,40


In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Gender                  200 non-null    object
 1   Age                     200 non-null    int64 
 2   Annual Income (k$)      200 non-null    int64 
 3   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 7.8+ KB


In [120]:
df.describe()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0
mean,38.85,60.56,50.2
std,13.969007,26.264721,25.823522
min,18.0,15.0,1.0
25%,28.75,41.5,34.75
50%,36.0,61.5,50.0
75%,49.0,78.0,73.0
max,70.0,137.0,99.0


## 카테고리 변수 처리

In [121]:
df = pd.get_dummies(df, columns=['Gender'], drop_first=True)

In [122]:
df['Gender_Male'] = df['Gender_Male'].astype('int64')
df

Unnamed: 0_level_0,Age,Annual Income (k$),Spending Score (1-100),Gender_Male
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,19,15,39,1
2,21,15,81,1
3,20,16,6,0
4,23,16,77,0
5,31,17,40,0
...,...,...,...,...
196,35,120,79,0
197,45,126,28,0
198,32,126,74,1
199,32,137,18,1


## KMeans Clustering 모델링

In [123]:
from sklearn.cluster import KMeans

In [124]:
model = KMeans(n_clusters=3)

In [125]:
model.fit(df)

In [126]:
model.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2,
       1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
       1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
       1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
       1, 2])

## 분석 결과 Summary

In [127]:
result_df = df.copy()

In [128]:
result_df['label'] = model.labels_

In [129]:
result_df

Unnamed: 0_level_0,Age,Annual Income (k$),Spending Score (1-100),Gender_Male,label
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,19,15,39,1,0
2,21,15,81,1,0
3,20,16,6,0,0
4,23,16,77,0,0
5,31,17,40,0,0
...,...,...,...,...,...
196,35,120,79,0,2
197,45,126,28,0,1
198,32,126,74,1,2
199,32,137,18,1,1


In [130]:
result_df.groupby('label').mean()

Unnamed: 0_level_0,Age,Annual Income (k$),Spending Score (1-100),Gender_Male
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,40.325203,44.154472,49.829268,0.406504
1,40.394737,87.0,18.631579,0.526316
2,32.692308,86.538462,82.128205,0.461538


In [131]:
result_df['label'].value_counts()

label
0    123
2     39
1     38
Name: count, dtype: int64

## Elbow Method

In [132]:
distance = []

for i in range(2, 11):
    model = KMeans(n_clusters=i)
    model.fit(df)
    distance.append(model.inertia_)
distance

[221222.99561403508,
 154627.55913714768,
 104414.67534220163,
 79501.91330865468,
 71549.88328330725,
 51458.29097412091,
 48393.019614256715,
 45060.36398440435,
 39577.067459245416]

In [133]:
import plotly.express as px
px.line(x=list(range(2, 11)), y=distance)

## Silhoutte Score
###### Elbow Method로 최적값을 찾기 어려울 때 사용할 수 있다 

In [134]:
from sklearn.metrics import silhouette_score

In [135]:
silhouette_score(df, model.labels_)

0.37056457780855895

In [136]:
sil = []

for i in range(2, 11):
    model = KMeans(n_clusters=i)
    model.fit(df)
    sil.append(silhouette_score(df, model.labels_))
sil

[0.29307334005502633,
 0.383798873822341,
 0.40553486600451777,
 0.4402372958247734,
 0.45205475380756527,
 0.4039250372729522,
 0.42438984757889825,
 0.4078506439997503,
 0.37060918590721686]

In [137]:
px.line(x=list(range(2, 11)), y=sil)

## 최적의 k값으로 리모델링 

In [138]:
model = KMeans(n_clusters=6)

In [139]:
model.fit(df)

In [140]:
df['label'] = model.labels_

In [141]:
df.groupby('label').mean()

Unnamed: 0_level_0,Age,Annual Income (k$),Spending Score (1-100),Gender_Male
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,56.155556,53.377778,49.088889,0.444444
1,32.692308,86.538462,82.128205,0.461538
2,41.685714,88.228571,17.285714,0.571429
3,27.0,56.657895,49.131579,0.342105
4,25.272727,25.727273,79.363636,0.409091
5,44.142857,25.142857,19.52381,0.380952


In [142]:
px.box(data_frame=df, x='label', y='Age')

In [143]:
px.box(data_frame=df, x='label', y='Annual Income (k$)')