In [1]:
import pandas as pd
from datetime import timedelta
import numpy as np

# Read datasets in
> Ensure that zip codes are read as strings, not as numeric values

In [2]:
# Load the customer sales dataset.
# BillPostalCode is forced to the pandas "string" dtype so postal codes are
# not type-inferred as numbers.
file_path = "data/Customer_sales.csv"
df_customer_sales = pd.read_csv(file_path, dtype={"BillPostalCode": "string"})

# Number of distinct customers in the file (only the last expression of a
# cell is displayed, so the earlier bare DataFrame expression was a no-op).
df_customer_sales["CustomerID"].nunique()

141461

In [3]:
# Aggregate to one row per (order, order date, customer) with the summed
# Total. The built-in 'sum' aggregation is used instead of a Python lambda so
# pandas can take its optimized groupby path.
Orders_df = (
    df_customer_sales
    .groupby(['ChannelOrderID', 'OrderDate', 'CustomerID'])
    .agg({'Total': 'sum'})
    .reset_index()
)
Orders_df

Unnamed: 0,ChannelOrderID,OrderDate,CustomerID,Total
0,4196075,2020-12-31,240414012,39.99
1,4196093,2020-12-31,240413012,250.00
2,4196116,2020-12-31,240411012,19.99
3,4196117,2020-12-31,229366012,49.98
4,4196129,2020-12-31,240475012,129.99
...,...,...,...,...
205376,5623963,2023-01-25,375206012,16.99
205377,5623964,2023-01-25,375205012,4.99
205378,5623979,2023-01-25,375204012,3299.99
205379,5623980,2023-01-25,375203012,48.99


In [4]:
# Cast OrderDate to datetime64[ns] so that date arithmetic is possible
Orders_df = Orders_df.astype({'OrderDate': 'datetime64[ns]'})

In [5]:
# Inspect the column dtypes of the per-order table
Orders_df.dtypes

ChannelOrderID             int64
OrderDate         datetime64[ns]
CustomerID                 int64
Total                    float64
dtype: object

In [6]:
# Reference date for the analysis: one day after the most recent order date
latest_order_date = Orders_df['OrderDate'].max()
NOW = latest_order_date + timedelta(days=1)
NOW

Timestamp('2023-01-26 00:00:00')

In [7]:
# Look-back window in days: two years (2 x 365 = 730)
period = 2 * 365

In [8]:
# Add a DaysSinceOrder column (whole days between NOW and each order date) to
# aid the later calculations. Vectorized timedelta arithmetic replaces the
# row-by-row Python lambda.
Orders_df['DaysSinceOrder'] = (NOW - Orders_df['OrderDate']).dt.days

In [9]:
# Build the per-customer Recency / Frequency table.
#   Recency   = smallest DaysSinceOrder, i.e. days since the latest order
#               (computed over ALL orders, not only those inside the window)
#   Frequency = number of orders dated on or after NOW - period days
window_start = NOW - timedelta(days=period)

aggr = {
    'DaysSinceOrder': 'min',
    'InWindow': 'sum',  # summing a boolean flag counts the True rows
}

rfm_df = (
    Orders_df
    .assign(InWindow=Orders_df['OrderDate'] >= window_start)
    .groupby('CustomerID')
    .agg(aggr)
    .reset_index()
    .rename(columns={'DaysSinceOrder': 'Recency', 'InWindow': 'Frequency'})
)
rfm_df

Unnamed: 0,CustomerID,Recency,Frequency
0,10012,636,3
1,24012,50,11
2,36012,48,7
3,48012,15,42
4,49012,43,14
...,...,...,...
141456,375213012,1,1
141457,375214012,1,1
141458,375215012,1,1
141459,375216012,1,1


In [10]:
# Monetary = total order value per customer for orders dated on or after
# NOW - period days. Customers with no order inside the window get 0.
#
# This is computed with a single filtered groupby rather than re-filtering
# the whole Orders_df once per customer, which scales as
# (number of customers) x (number of orders).
monetary_window_start = NOW - timedelta(days=period)
monetary_by_customer = (
    Orders_df.loc[Orders_df['OrderDate'] >= monetary_window_start]
    .groupby('CustomerID')['Total']
    .sum()
)

rfm_df['Monetary'] = rfm_df['CustomerID'].map(monetary_by_customer).fillna(0.0)

rfm_df

Unnamed: 0,CustomerID,Recency,Frequency,Monetary
0,10012,636,3,121.98
1,24012,50,11,1152.57
2,36012,48,7,268.83
3,48012,15,42,7014.86
4,49012,43,14,2460.47
...,...,...,...,...
141456,375213012,1,1,25.00
141457,375214012,1,1,60.00
141458,375215012,1,1,16.99
141459,375216012,1,1,24.94


In [11]:
# Quantile cut points (0.2, 0.4, 0.6, 0.8) for each RFM metric, stored as a
# nested dict of the form {metric: {quantile: value}}.
# NOTE(review): in the recorded output every Frequency cut point is 1.0, so a
# "<=" ladder over these cut points can only ever yield the lowest or the
# highest Frequency score -- confirm this is intended.
rfm_metrics = rfm_df[['Recency', 'Frequency', 'Monetary']]
quintiles = rfm_metrics.quantile([.2, .4, .6, .8]).to_dict()
quintiles

{'Recency': {0.2: 142.0, 0.4: 285.0, 0.6: 449.0, 0.8: 598.0},
 'Frequency': {0.2: 1.0, 0.4: 1.0, 0.6: 1.0, 0.8: 1.0},
 'Monetary': {0.2: 18.99, 0.4: 36.99, 0.6: 71.99, 0.8: 172.94}}

In [12]:
def r_score(x, thresholds=None):
    """Score a Recency value from 5 (most recent) down to 1 (least recent).

    Parameters
    ----------
    x : number
        Recency value to score.
    thresholds : mapping, optional
        Cut points keyed by the quantiles 0.2, 0.4, 0.6 and 0.8. When omitted,
        the module-level ``quintiles['Recency']`` is used, which matches the
        previous behaviour. Passing it explicitly removes the dependency on
        notebook state.

    Returns
    -------
    int
        5 if ``x`` is at or below the 0.2 cut point, 4 for the 0.4 cut point,
        3 for 0.6, 2 for 0.8, otherwise 1.
    """
    if thresholds is None:
        thresholds = quintiles['Recency']
    # Lower recency is better, so scores descend as the cut points ascend.
    for score, q in zip((5, 4, 3, 2), (.2, .4, .6, .8)):
        if x <= thresholds[q]:
            return score
    return 1

def fm_score(x, c, thresholds=None):
    """Score a Frequency or Monetary value from 1 (lowest) up to 5 (highest).

    Parameters
    ----------
    x : number
        Value to score.
    c : str
        Metric name used to look up cut points in the module-level
        ``quintiles`` dict (e.g. ``'Frequency'`` or ``'Monetary'``).
    thresholds : mapping, optional
        Cut points keyed by the quantiles 0.2, 0.4, 0.6 and 0.8. When omitted,
        ``quintiles[c]`` is used, which matches the previous behaviour.
        Passing it explicitly removes the dependency on notebook state.

    Returns
    -------
    int
        1 if ``x`` is at or below the 0.2 cut point, 2 for the 0.4 cut point,
        3 for 0.6, 4 for 0.8, otherwise 5.
    """
    if thresholds is None:
        thresholds = quintiles[c]
    # Higher values are better, so scores ascend with the cut points.
    for score, q in zip((1, 2, 3, 4), (.2, .4, .6, .8)):
        if x <= thresholds[q]:
            return score
    return 5

In [13]:
# Score each metric; the scoring functions are passed directly and the metric
# name for fm_score is supplied through `args`.
rfm_df['R'] = rfm_df['Recency'].apply(r_score)
rfm_df['F'] = rfm_df['Frequency'].apply(fm_score, args=('Frequency',))
rfm_df['M'] = rfm_df['Monetary'].apply(fm_score, args=('Monetary',))

rfm_df

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,R,F,M
0,10012,636,3,121.98,1,5,4
1,24012,50,11,1152.57,5,5,5
2,36012,48,7,268.83,5,5,5
3,48012,15,42,7014.86,5,5,5
4,49012,43,14,2460.47,5,5,5
...,...,...,...,...,...,...,...
141456,375213012,1,1,25.00,5,1,2
141457,375214012,1,1,60.00,5,1,3
141458,375215012,1,1,16.99,5,1,1
141459,375216012,1,1,24.94,5,1,2


In [14]:
# Concatenate the three single-digit scores into one string, e.g. 5,5,5 -> "555"
r_txt = rfm_df['R'].astype(str)
f_txt = rfm_df['F'].astype(str)
m_txt = rfm_df['M'].astype(str)
rfm_df['RFM Score'] = r_txt + f_txt + m_txt
rfm_df.head()

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,R,F,M,RFM Score
0,10012,636,3,121.98,1,5,4,154
1,24012,50,11,1152.57,5,5,5,555
2,36012,48,7,268.83,5,5,5,555
3,48012,15,42,7014.86,5,5,5,555
4,49012,43,14,2460.47,5,5,5,555


In [15]:
# Regex patterns over the two-character "<R><F>" code, mapped to a segment
# label. Between them the patterns cover every R/F combination from 1 to 5.
# NOTE(review): "loose" in "can't loose" looks like a misspelling of "lose";
# it is left unchanged here because the label is written to the output data.
segt_map = {
    r'[1-2][1-2]': "hibernating",
    r'[1-2][3-4]': "at risk",
    r'[1-2]5': "can't loose",
    r'3[1-2]': "about to sleep",
    r'33': "need attention",
    r'[3-4][4-5]': "loyal customers",
    r'41': "promising",
    r'51': "new customers",
    r'[4-5][2-3]': "potential loyalists",
    r'5[4-5]': "champions",
}

# Build the "<R><F>" code, then translate it to its segment label.
rf_code = rfm_df['R'].astype(str) + rfm_df['F'].astype(str)
rfm_df['Segment'] = rf_code.replace(segt_map, regex=True)
rfm_df

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,R,F,M,RFM Score,Segment
0,10012,636,3,121.98,1,5,4,154,can't loose
1,24012,50,11,1152.57,5,5,5,555,champions
2,36012,48,7,268.83,5,5,5,555,champions
3,48012,15,42,7014.86,5,5,5,555,champions
4,49012,43,14,2460.47,5,5,5,555,champions
...,...,...,...,...,...,...,...,...,...
141456,375213012,1,1,25.00,5,1,2,512,new customers
141457,375214012,1,1,60.00,5,1,3,513,new customers
141458,375215012,1,1,16.99,5,1,1,511,new customers
141459,375216012,1,1,24.94,5,1,2,512,new customers


In [18]:
# Write the RFM table to CSV, omitting the DataFrame index.
# NOTE(review): the execution counter jumps from 15 to 18 before this cell --
# confirm the notebook reproduces under Restart Kernel -> Run All.
rfm_df.to_csv('data/Customer_rfm.csv', index=False)