## Customer Segmentation by RFM (Recency, Frequency, Monetary)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

from datetime import datetime
from sklearn.cluster import KMeans
from functools import wraps

# Global seaborn appearance for every figure below: 'talk' scaling context,
# white-grid background style and the 'husl' colour palette.
sns.set_context('talk')
sns.set_style('whitegrid')
sns.set_palette('husl')

In [2]:
# Load the transactions workbook into a DataFrame (path is relative to the working directory)
df = pd.read_excel('Online Retail.xlsx')
# Lower-case the column names so they can be used with attribute access (e.g. df.country)
df.columns = df.columns.str.lower()

FileNotFoundError: [Errno 2] No such file or directory: 'OnlineRetail.xlsx'

In [None]:
# Keep only United Kingdom transactions. The explicit .copy() makes `df` an independent
# frame, so later column assignments on it do not trigger pandas' SettingWithCopyWarning.
df = df[df.country == 'United Kingdom'].copy()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row
df.describe().T

In [None]:
# Preview the first rows
df.head()

In [None]:
# Heatmap of the boolean null mask: shows which rows/columns contain missing values
sns.heatmap(df.isnull())

In [None]:
# Fill missing product descriptions with the placeholder text 'N/A'
df['description'] = df['description'].fillna('N/A')

In [None]:
# The customer id is the key we cluster on: a transaction without one cannot be attributed
# to any customer, and a key cannot be imputed. Drop only the rows whose customerid is
# missing (rather than rows with a null in any column), and rebind instead of mutating
# in place.
df = df.dropna(subset=['customerid'])

In [None]:
# Count the null values left in each column
df.isnull().sum()

### Define Decorator & Class

In [None]:
# Define decorator timer
def timer(func):
    """Decorator that prints how long each call of `func` took.

    Parameters:
        func: the callable to wrap.

    Returns:
        A wrapper with the same name and docstring as `func` (via functools.wraps)
        that forwards all arguments, prints the elapsed seconds and returns the
        wrapped function's result unchanged.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock meant for measuring
        # intervals; time.time() follows the system clock and can jump.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        total = time.perf_counter() - start
        print(f'{total:.2f}s runtime on {func.__name__} function')
        return result
    return wrapper

In [None]:
# Define Cluster class
class Cluster:
    """K-means helpers for clustering a feature and tidying the resulting labels.

    Parameters:
        random_state: seed handed to every KMeans model so that repeated runs give
            the same clusters (default 42). Pass None to let scikit-learn use a
            different random initialisation on each run.
    """

    def __init__(self, random_state=42):
        self.random_state = random_state

    def _make_kmeans(self, n_clusters):
        """Build a KMeans model with the settings shared by predict() and elbow()."""
        # n_init is pinned explicitly: scikit-learn changed its default from 10 to
        # 'auto' (v1.4), so leaving it out gives version-dependent results.
        return KMeans(n_clusters=n_clusters, max_iter=1000, n_init=10,
                      random_state=self.random_state)

    @timer
    def predict(self, n_clusters, data):
        """Fit KMeans with `n_clusters` clusters on `data` and label the same data.

        Returns:
            The array of cluster labels, one per row of `data`.
        """
        kmean = self._make_kmeans(n_clusters)
        kmean.fit(data)
        return kmean.predict(data)

    @timer
    def elbow(self, data, max_k=9):
        """Compute the KMeans inertia for every cluster count from 1 to `max_k`.

        Parameters:
            data: the samples to cluster.
            max_k: largest number of clusters to try (default 9).

        Returns:
            (k_range, distance): the range of cluster counts tried and a list with
            the fitted model's inertia_ for each of them, in the same order.
        """
        k_range = range(1, max_k + 1)
        distance = [self._make_kmeans(k).fit(data).inertia_ for k in k_range]
        return k_range, distance

    def plot_elbow(self, k_range, distance):
        """Draw the elbow curve (cluster count vs. inertia) on the current axes and show it."""
        # x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
        sns.lineplot(x=list(k_range), y=distance, marker='o')
        plt.xlabel('Number of Cluster'); plt.ylabel('Distance')
        plt.title('Elbow Method for Optimal K')
        plt.show()

    @timer
    def cluster_order(self, cluster_name, target_name, df, ascending):
        """Relabel clusters so that the labels follow the ordering of a target's mean.

        The mean of `target_name` is computed per cluster, the clusters are sorted by
        that mean (direction given by `ascending`) and renumbered 0, 1, 2, ... in sorted
        order. With ascending=True label 0 is the cluster with the smallest mean; with
        ascending=False label 0 is the cluster with the largest mean.

        Returns:
            A new DataFrame built by merging `df` with the new labels: the
            `cluster_name` column holds the new labels and is the last column.
            `df` itself is not modified.
        """
        # Temporary column name chosen so it cannot clash with a real column of `df`.
        rank_col = '__ordered_label__'

        ordered = (
            df.groupby(cluster_name)[target_name].mean()
              .reset_index()
              .sort_values(target_name, ascending=ascending)
              .reset_index(drop=True)
        )
        # After the reset, the positional index is exactly the new label.
        ordered[rank_col] = ordered.index

        return (
            df.merge(ordered[[cluster_name, rank_col]], on=cluster_name)
              .drop(columns=cluster_name)
              .rename(columns={rank_col: cluster_name})
        )

## 1. Recency

In [None]:
# Latest invoice timestamp per customer
purchase_history = (
    df.groupby('customerid')['invoicedate']
      .max()
      .reset_index()
      .rename(columns={'customerid': 'customer_id', 'invoicedate': 'recent_purchase_date'})
)

# Recency = whole days between a customer's last purchase and the latest purchase in the data
latest_purchase = purchase_history['recent_purchase_date'].max()
days_since_last = latest_purchase - purchase_history['recent_purchase_date']
purchase_history['recency'] = days_since_last.dt.days

# One row per unique customer id, with that customer's recency attached
user = pd.DataFrame({'customer_id': df['customerid'].unique()})
user = user.merge(purchase_history[['customer_id', 'recency']], on='customer_id')

In [None]:
# How to read the table: customer 17850's last purchase was 301 days before the most recent
# invoice date in the data, while customer 12583's last purchase was 2 days before it.
user.head()

In [None]:
# Summary statistics of the per-customer table, one row per described column
user.describe().T

The average recency is 91 days, whereas the median is 49 days. A mean well above the median indicates a right-skewed distribution.

In [None]:
# Distribution of recency. sns.distplot is deprecated, so the same picture is built from its
# replacements: a density-scaled histogram with a KDE curve, plus a rug plot.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(x=user.recency, kde=True, stat='density', ax=ax)
sns.rugplot(x=user.recency, ax=ax)
# The y axis is a density (the histogram is normalised), not a raw count.
ax.set_ylabel('Density')
ax.set_title('Recency');

In [None]:
# Select recency as a one-column DataFrame (double brackets), the input used for clustering
recency_df = user[['recency']]

In [None]:
# Create the clustering helper and compute the elbow data for recency:
# k_range holds the cluster counts tried, distance the KMeans inertia for each of them
cluster = Cluster()
k_range, distance = cluster.elbow(recency_df)

In [None]:
# Switch to the plain 'white' seaborn style and open an 8x6 figure for the elbow plot
sns.set_style('white')
plt.figure(figsize=(8,6))

cluster.plot_elbow(k_range, distance)

Three clusters seem to be optimal according to the elbow plot. However, we will use four clusters.

In [None]:
# Fit KMeans with 4 clusters on recency and store each customer's cluster label,
# then summarise recency within each cluster
recency_preds = cluster.predict(4, recency_df)

user['recency_cluster'] = recency_preds
user.groupby('recency_cluster')['recency'].describe()

In [None]:
# Relabel the recency clusters by their mean recency in descending order (ascending=False):
# label 0 goes to the cluster with the highest mean recency, the last label to the lowest.
user_df = cluster.cluster_order('recency_cluster', 'recency', user, False)
user_df.groupby('recency_cluster')['recency'].describe()

The cluster labels are now ordered by recency: 0 is the least active cluster (longest time since the last purchase) and 3 is the most active (most recent purchases).

In [None]:
# Display the per-customer table with the relabelled recency cluster
user_df

In [None]:
# Visualise each customer's recency, coloured by recency cluster
# (fit_reg=False: scatter points only, no regression line)
sns.lmplot(x='recency', y='customer_id', hue='recency_cluster', data=user_df, fit_reg=False)

## 2. Frequency

In [None]:
# Frequency = number of rows of df per customer (count of non-null invoicedate values).
# NOTE(review): if df holds one row per invoice line, this counts line items rather than
# distinct invoices -- confirm that this is the intended definition of frequency.
user_frequency = df.groupby('customerid')['invoicedate'].count().reset_index()
user_frequency.columns = ['customer_id', 'frequency']
user_frequency

In [None]:
# Distribution of frequency, restricted to customers with frequency < 500 so the long tail
# does not flatten the plot. sns.distplot is deprecated; histplot + rugplot replace it.
freq_under_500 = user_frequency.query('frequency < 500')['frequency']

fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(x=freq_under_500, kde=True, stat='density', ax=ax)
sns.rugplot(x=freq_under_500, ax=ax)
ax.set_ylabel('Density')
ax.set_title('Frequency (customers with frequency < 500)');

In [None]:
# Attach frequency to the per-customer table. Any existing 'frequency' column is dropped
# first so the cell can be re-run safely: a second plain merge would otherwise produce
# duplicated frequency_x / frequency_y columns.
user_df = (
    user_df.drop(columns='frequency', errors='ignore')
           .merge(user_frequency, on='customer_id')
)
user_df

In [None]:
# Summary statistics of frequency across customers
user_df.frequency.describe()

In [None]:
# Elbow data and plot for frequency: KMeans inertia for each cluster count tried
f_range, f_distance = cluster.elbow(user_df[['frequency']])
cluster.plot_elbow(f_range, f_distance)

In [None]:
# Fit KMeans with 4 clusters on frequency and store each customer's cluster label,
# then summarise frequency within each cluster
frequency_preds = cluster.predict(4, user_df[['frequency']])

user_df['frequency_cluster'] = frequency_preds
user_df.groupby('frequency_cluster')['frequency'].describe()

In [None]:
# Relabel the frequency clusters by their mean frequency in ascending order (ascending=True):
# label 0 goes to the cluster with the lowest mean frequency, the last label to the highest.
user_df = cluster.cluster_order('frequency_cluster', 'frequency', user_df, True)
user_df.groupby('frequency_cluster')['frequency'].describe()

The frequency clusters are labelled 0–3 in the same way as the recency clusters: 3 is the most favourable, as this cluster has the highest purchase frequency.

## 3. Monetary

In [None]:
# Show a single transaction row as a reminder of the available columns
df.head(1)

In [None]:
# Revenue of each transaction row = quantity x unit price. assign() returns a new frame
# instead of writing a column into `df` in place, which avoids SettingWithCopyWarning
# when `df` is a filtered slice of the loaded data.
df = df.assign(revenue=df.quantity * df.unitprice)

# Total revenue per customer
user_revenue = df.groupby('customerid')['revenue'].sum().reset_index()
user_revenue.columns = ['customer_id', 'revenue']
user_revenue

In [None]:
# Attach revenue to the per-customer table. Any existing 'revenue' column is dropped first
# so the cell can be re-run safely: a second plain merge would otherwise produce
# duplicated revenue_x / revenue_y columns.
user_df = (
    user_df.drop(columns='revenue', errors='ignore')
           .merge(user_revenue, on='customer_id')
)
user_df

In [None]:
# Distribution of revenue, restricted to customers with revenue < 10000 so the long tail
# does not flatten the plot. sns.distplot is deprecated; histplot + rugplot replace it.
revenue_under_10k = user_df.query('revenue<10000')['revenue']

fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(x=revenue_under_10k, kde=True, stat='density', ax=ax)
sns.rugplot(x=revenue_under_10k, ax=ax)
ax.set_ylabel('Density')
ax.set_title('Revenue (customers with revenue < 10000)');

There are negative values for revenue, which indicate refunds or product returns.

In [None]:
# Fit KMeans with 4 clusters on revenue and store each customer's cluster label,
# then summarise revenue within each cluster
revenue_preds = cluster.predict(4, user_df[['revenue']])

user_df['revenue_cluster'] = revenue_preds
user_df.groupby('revenue_cluster')['revenue'].describe()

In [None]:
# Relabel the revenue clusters by their mean revenue in ascending order (ascending=True):
# label 0 goes to the cluster with the lowest mean revenue, the last label to the highest.
user_df = cluster.cluster_order('revenue_cluster', 'revenue', user_df, True)

In [None]:
# Summary statistics of revenue within each relabelled revenue cluster
user_df.groupby('revenue_cluster')['revenue'].describe()

We have re-ordered the clusters again, using the same method as above, so that cluster 3 is the most favourable in terms of monetary value.

In [None]:
# Preview the per-customer table, now holding recency, frequency, revenue and their clusters
user_df.head()

## Overall Score

We have created three types of cluster (recency, frequency, monetary), where a higher cluster value means a more favourable customer. Therefore, we can add these cluster values together to find the top customers in the online retail dataset. The customers with the highest scores have purchased recently, purchase frequently (repeat purchases) and are the top spenders.

In [None]:
# Overall score = sum of the three ordered cluster labels (recency + frequency + revenue)
user_df['score'] = user_df['recency_cluster'] + user_df['frequency_cluster'] + user_df['revenue_cluster']

In [None]:
# Mean recency, frequency and revenue for each overall score. The columns are selected with
# a list: selecting several groupby columns with a bare tuple raises in pandas >= 2.0.
user_df.groupby('score')[['recency', 'frequency', 'revenue']].mean()

In [None]:
# Map the overall score onto three segments: a score above 4 is High-Value, a score above 2
# (up to 4) is Mid-Value, and everything else is Low-Value. np.select picks the first
# matching condition, so the stricter test is listed first.
score_bands = [user_df.score > 4, user_df.score > 2]
band_names = ['High-Value', 'Mid-Value']
user_df['segment'] = np.select(score_bands, band_names, default='Low-Value')

In [None]:
# Display the per-customer table including score and segment
user_df

In [None]:
# Frequency vs revenue on log scales, one marker style per customer segment.
# The style is set before the figure is created: a seaborn style only affects axes that
# are created after it is set.
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(15,10))

segment_styles = [('High-Value', '>', 'High'), ('Mid-Value', 'o', 'Mid'), ('Low-Value', '+', 'Low')]
for segment_name, marker, label in segment_styles:
    # log(revenue + 1) is undefined for revenue <= -1 (customers whose refunds exceed their
    # purchases); such customers cannot be placed on a log axis, so they are filtered out
    # explicitly instead of producing NaN/-inf values and a runtime warning.
    members = user_df[(user_df.segment == segment_name) & (user_df.revenue > -1)]
    ax.scatter(np.log(members['frequency']), np.log1p(members['revenue']),
               marker=marker, alpha=0.5, label=label)
ax.set_xlabel('Frequency log(count)')
ax.set_ylabel('Revenue log($)')
fig.suptitle('Frequency vs Revenue by Customer Segmentation')

ax.legend()
plt.show()

In [None]:
# Recency vs revenue on log scales, one marker style per customer segment.
# The style is set before the figure is created: a seaborn style only affects axes that
# are created after it is set.
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(15,10))

segment_styles = [('High-Value', '>', 'High'), ('Mid-Value', 'o', 'Mid'), ('Low-Value', '+', 'Low')]
for segment_name, marker, label in segment_styles:
    # log(revenue + 1) is undefined for revenue <= -1 (customers whose refunds exceed their
    # purchases); such customers cannot be placed on a log axis, so they are filtered out
    # explicitly instead of producing NaN/-inf values and a runtime warning.
    members = user_df[(user_df.segment == segment_name) & (user_df.revenue > -1)]
    # log1p keeps customers with recency 0 (a purchase on the latest invoice date) on the
    # plot; a plain log would send them to -inf.
    ax.scatter(np.log1p(members['recency']), np.log1p(members['revenue']),
               marker=marker, alpha=0.5, label=label)
ax.set_xlabel('Recency log(days + 1)')
ax.set_ylabel('Revenue log($)')
fig.suptitle('Recency vs Revenue by Customer Segmentation')

ax.legend()
plt.show()

In [None]:
# Recency vs frequency on log scales, one marker style per customer segment.
# The style is set before the figure is created: a seaborn style only affects axes that
# are created after it is set.
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(15,10))

segment_styles = [('High-Value', '>', 'High'), ('Mid-Value', 'o', 'Mid'), ('Low-Value', '+', 'Low')]
for segment_name, marker, label in segment_styles:
    members = user_df[user_df.segment == segment_name]
    # log1p keeps customers with recency 0 (a purchase on the latest invoice date) on the
    # plot; a plain log would send them to -inf.
    ax.scatter(np.log1p(members['recency']), np.log1p(members['frequency']),
               marker=marker, alpha=0.5, label=label)
ax.set_xlabel('Recency log(days + 1)')
ax.set_ylabel('Frequency log(count + 1)')
fig.suptitle('Recency vs Frequency by Customer Segmentation')

ax.legend()
plt.show()

In [None]:
# Keep only the columns needed to tag transactions: score, segment and the customer key
user_df_t = user_df[['score', 'segment', 'customer_id']]
user_df_t

In [None]:
# Display the transaction-level table before score and segment are joined onto it
df

In [None]:
# Left-join score and segment onto every transaction row, matching df.customerid to
# user_df_t.customer_id (how='left' keeps all transaction rows)
csv = df.merge(user_df_t, left_on='customerid', right_on='customer_id', how='left')

In [None]:
# Remove the duplicate join key brought in by the merge (customerid is kept). Rebinding
# with errors='ignore' keeps the cell re-runnable: an in-place drop raises KeyError the
# second time because the column is already gone.
csv = csv.drop(columns='customer_id', errors='ignore')

In [None]:
# Preview the transaction rows with score and segment attached
csv.head()

In [None]:
# Write the scored transactions to a CSV file, without the DataFrame index column
csv.to_csv('online_retail_rfmscore.csv', index=False)