In [1]:
import pandas as pd

df = pd.read_csv('./data/ecommerce_data_final.csv')

df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,date_ymd,year,amount
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,2010-12-01,2010,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,2010-12-01,2010,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,2010-12-01,2010,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,2010-12-01,2010,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,2010-12-01,2010,20.34


---
- Recency (최근성) : 고객이 얼마나 최근에 구매를 했는지
- Frequeny (빈도) : 고객이 얼마나 자주 구매를 했는지
- Monetary (금액) : 고객이 구매한 총 금액

해당 분석에서는 Recency와 Monetary 두 요소를 가지고 RM분석 진행

---

In [12]:
# 가장 최근 방문일자

today_date = max(df['date_ymd'])
today_date

Timestamp('2011-12-09 00:00:00')

In [9]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['date_ymd'] = pd.to_datetime(df['date_ymd'])

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397924 entries, 0 to 397923
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    397924 non-null  int64         
 1   StockCode    397924 non-null  object        
 2   Description  397924 non-null  object        
 3   Quantity     397924 non-null  int64         
 4   InvoiceDate  397924 non-null  datetime64[ns]
 5   UnitPrice    397924 non-null  float64       
 6   CustomerID   397924 non-null  int64         
 7   Country      397924 non-null  object        
 8   date_ymd     397924 non-null  datetime64[ns]
 9   year         397924 non-null  int64         
 10  amount       397924 non-null  float64       
dtypes: datetime64[ns](2), float64(2), int64(4), object(3)
memory usage: 33.4+ MB


In [14]:
rfm = df.groupby('CustomerID').agg({'InvoiceDate' : lambda x: (today_date - x.max()).days,    # 최근 방문일로부터 몇일이 지났는지
                                    'amount' : lambda x: x.sum()})                            # 주문금액

rfm.columns = ['recency', 'monetary']
rfm.head()

Unnamed: 0_level_0,recency,monetary
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1
12346,324,77183.6
12347,1,4310.0
12348,74,1797.24
12349,17,1757.55
12350,309,334.4


각 팩터를 5등급으로 나누어 등급화

In [16]:
rfm['recency_score'] = pd.qcut(rfm['recency'], 3, labels = [3, 2, 1])
rfm['monetary_score'] = pd.qcut(rfm['monetary'], 3, labels = [1, 2, 3])
rfm['rm_score'] = rfm['recency_score'].astype(str) + rfm['monetary_score'].astype(str)

rfm.reset_index(inplace = True)
rfm.head()

Unnamed: 0,CustomerID,recency,monetary,recency_score,monetary_score,rm_score
0,12346,324,77183.6,1,3,13
1,12347,1,4310.0,3,3,33
2,12348,74,1797.24,2,3,23
3,12349,17,1757.55,3,3,33
4,12350,309,334.4,1,1,11


In [20]:
# 등급별 고객 수 현황
rm_score = rfm.groupby('rm_score')[['CustomerID']].nunique().reset_index().rename(columns = {'CustomerID' : 'customer_count'})
rm_score

Unnamed: 0,rm_score,customer_count
0,11,790
1,12,481
2,13,170
3,21,407
4,22,546
5,23,457
6,31,250
7,32,419
8,33,819


등급을 통해 고객 분류

In [22]:
def categorize_customer(score):
    if score == '33':
        return '최우수'     # 최신성, 구매 모두 상당히 높음
    elif score in ['32', '23', '22']:
        return '우수'       # 최신성, 구매 모두 높음
    elif score == '11':
        return '휴면'       # 최신성, 구매 모두 낮음
    elif score in ['12', '13']:
        return '이탈방지'   # 구매는 높으나 최신성은 낮음 -> 다시 불러들어야 함
    elif score in ['31', '21']:
        return '구매 유도'  # 최신성은 높으나 구매가 낮음 -> 구매를 유도해야 함
    
rm_score['category'] = rm_score['rm_score'].apply(categorize_customer)
rm_score

Unnamed: 0,rm_score,customer_count,category
0,11,790,휴면
1,12,481,이탈방지
2,13,170,이탈방지
3,21,407,구매 유도
4,22,546,우수
5,23,457,우수
6,31,250,구매 유도
7,32,419,우수
8,33,819,최우수


시각화

In [24]:
import plotly.express as px

fig = px.treemap(data_frame = rm_score, path = ['category'], values = 'customer_count', color_discrete_sequence = px.colors.qualitative.Pastel1)
fig.show()