# 고객 유형 분류
## KMeans Clustering

고객 정보(성별, 나이)와 연간 수입, 소비 점수 데이터를 활용하여 
KMeans Clustering으로 고객 유형을 분류한다. 

## 모듈 및 데이터 로드

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session (no category or module
# filter is given, so library deprecation/future warnings are hidden too).
warnings.filterwarnings('ignore')

In [21]:
# Load the mall customer data; the first CSV column becomes the row index.
df = pd.read_csv('Mall_Customers.csv', index_col=0)

In [22]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0_level_0,Gender,Age,Annual Income (k$),Spending Score (1-100)
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Male,19,15,39
2,Male,21,15,81
3,Female,20,16,6
4,Female,23,16,77
5,Female,31,17,40


In [23]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Gender                  200 non-null    object
 1   Age                     200 non-null    int64 
 2   Annual Income (k$)      200 non-null    int64 
 3   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 7.8+ KB


In [24]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0
mean,38.85,60.56,50.2
std,13.969007,26.264721,25.823522
min,18.0,15.0,1.0
25%,28.75,41.5,34.75
50%,36.0,61.5,50.0
75%,49.0,78.0,73.0
max,70.0,137.0,99.0


## 카테고리 변수 처리

In [25]:
# One-hot encode the categorical Gender column. drop_first=True drops the
# first category so only a single indicator column (Gender_Male) remains.
df = pd.get_dummies(
    df,
    columns=['Gender'],
    drop_first=True,
)

In [26]:
# Display the encoded frame to confirm Gender was replaced by Gender_Male.
df

Unnamed: 0_level_0,Age,Annual Income (k$),Spending Score (1-100),Gender_Male
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,19,15,39,True
2,21,15,81,True
3,20,16,6,False
4,23,16,77,False
5,31,17,40,False
...,...,...,...,...
196,35,120,79,False
197,45,126,28,False
198,32,126,74,True
199,32,137,18,True


## KMeans Clustering 모델링

In [27]:
from sklearn.cluster import KMeans

In [28]:
# Three-cluster KMeans model. n_init and random_state are set explicitly so
# the centroid initialisation (and therefore the cluster labels and the
# summary tables derived from them) is reproducible across runs and across
# scikit-learn versions, whose default n_init has changed over time.
model = KMeans(n_clusters=3, n_init=10, random_state=42)

In [29]:
# Fit the clustering model on all columns of df (Age, Annual Income,
# Spending Score, Gender_Male).
# NOTE(review): the columns are passed unscaled; income/score span tens of
# units while Gender_Male is 0/1, so the indicator likely has little
# influence on the distance-based clustering — consider scaling first.
model.fit(df)

In [30]:
# Cluster index assigned to each row of df, in the same order as the rows.
model.labels_

array([2, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0,
       2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2,
       2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0,
       2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0,
       0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 0, 2, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0])

## 분석 결과 Summary

In [31]:
# Work on an independent copy so the feature frame used for fitting stays
# free of the label column added below.
result_df = df.copy(deep=True)

In [32]:
# Attach each customer's cluster index. This relies on model.labels_ being in
# the same row order as df, which result_df was copied from.
result_df['label'] = model.labels_

In [33]:
# Display the customers together with their assigned cluster label.
result_df

Unnamed: 0_level_0,Age,Annual Income (k$),Spending Score (1-100),Gender_Male,label
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,19,15,39,True,2
2,21,15,81,True,0
3,20,16,6,False,2
4,23,16,77,False,0
5,31,17,40,False,2
...,...,...,...,...,...
196,35,120,79,False,0
197,45,126,28,False,1
198,32,126,74,True,0
199,32,137,18,True,1


In [34]:
# Per-cluster mean of every feature, used to characterise each customer type.
result_df.groupby(by='label').mean()

Unnamed: 0_level_0,Age,Annual Income (k$),Spending Score (1-100),Gender_Male
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,28.779221,64.844156,75.038961,0.441558
1,40.394737,87.0,18.631579,0.526316
2,47.282353,44.858824,41.811765,0.4


In [35]:
# Number of customers assigned to each cluster.
result_df['label'].value_counts()

label
2    85
0    77
1    38
Name: count, dtype: int64

## Elbow Method

In [36]:
# Elbow method: fit KMeans for k = 2..10 and collect each fit's inertia
# (within-cluster sum of squared distances) to compare cluster counts.
# The models are built inline instead of being bound to `model`, so the
# 3-cluster model fitted earlier is not overwritten; n_init and random_state
# are fixed so the curve is reproducible from run to run.
distance = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(df).inertia_
    for k in range(2, 11)
]
distance

[219862.21754385968,
 143391.5923603568,
 105448.58764224753,
 75542.77371510217,
 68229.13969822448,
 55073.268424446585,
 48439.87309357309,
 41775.36646803118,
 38661.80228421366]

In [47]:
import plotly.express as px

# Plot inertia against the number of clusters. The x values must match the
# k range used when `distance` was built (2..10). Axis labels and a title are
# set explicitly because array inputs otherwise render as plain "x" / "y".
px.line(
    x=list(range(2, 11)),
    y=distance,
    labels={'x': 'Number of clusters (k)', 'y': 'Inertia'},
    title='Elbow Method',
)