# RFM Analysis

In [1]:
import math
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
from scipy.stats import norm
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')

### Dataset

This is a transactional data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based, registered, non-store online retailer.
	
https://archive.ics.uci.edu/ml/datasets/online+retail

**Attribute Information**:

    InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with the letter 'C', it indicates a cancellation.
    StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.
    Description: Product (item) name. Nominal.
    Quantity: The quantities of each product (item) per transaction. Numeric.
    InvoiceDate: Invoice date and time. Numeric, the day and time when each transaction was generated.
    UnitPrice: Unit price. Numeric, Product price per unit in sterling.
    CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.
    Country: Country name. Nominal, the name of the country where each customer resides. 

In [3]:
# Read the Online Retail workbook (path is relative to the working directory)
# and preview the first rows.
df = pd.read_excel('Online Retail.xlsx')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [None]:
# Treat the identifiers as text rather than numbers.
# Only the values that are present are cast: a plain astype(str) would turn a
# missing CustomerID into the literal string 'nan', which isnull()/dropna()
# no longer recognise as missing, so those rows would survive the cleanup and
# be counted as one artificial customer.
df['CustomerID'] = df['CustomerID'].astype(str).where(df['CustomerID'].notna())
df['InvoiceNo'] = df['InvoiceNo'].astype(str).where(df['InvoiceNo'].notna())

In [None]:
# Show the dtype pandas holds for each column.
df.dtypes

In [None]:
# Summary statistics of the data frame (count, mean, spread, quartiles).
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

### Data Cleanup

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# Clean-up in one non-mutating chain:
#   1. drop rows that have no invoice number,
#   2. strip stray leading/trailing whitespace from the product descriptions,
#   3. make sure invoice numbers are strings so the .str accessor works.
df = (
    df
    .dropna(axis=0, subset=['InvoiceNo'])
    .assign(
        Description=lambda d: d['Description'].str.strip(),
        InvoiceNo=lambda d: d['InvoiceNo'].astype('str'),
    )
)

# Remove the credit (cancellation) transactions. Per the dataset description a
# cancellation is an invoice number that *starts* with 'C', so test the prefix
# instead of searching for a 'C' anywhere in the code.
# .copy() detaches the result from the unfiltered frame, so later cells can add
# columns to df without writing into a slice of another frame.
df = df[~df['InvoiceNo'].str.startswith('C')].copy()

## EDA

In [None]:
# Number of distinct customers, shown as the shape tuple of the unique-id array.
df['CustomerID'].unique().shape

In [None]:
# Number of distinct stock codes, shown as the shape tuple of the unique-code array.
df['StockCode'].unique().shape

In [None]:
# Invoice lines recorded per country (one row of df = one product line of an
# invoice), largest first.
lines_per_country = df.groupby('Country')['InvoiceNo'].count()
lines_per_country.sort_values(ascending=False)

In [None]:
# Cumulative percentage of all invoice lines accounted for by the 13 customers
# with the most lines.
lines_per_customer = df['CustomerID'].value_counts()
share_pct = lines_per_customer / lines_per_customer.sum() * 100
share_pct.head(13).cumsum()

## RFM Analysis
Calculation of RFM metrics (recency, frequency, and monetary)
    
    recency = date of analysis - purchase date of the relevant customer
    frequency = customer's total number of purchases
    monetary = total monetary value as a result of the customer's total purchases
    
We will assume the date of analysis was 1 day after the max(date) in the dataset

In [None]:
# Revenue of each invoice line = unit price x units sold; then preview the
# five most valuable lines.
df['TotalPrice'] = df['UnitPrice'] * df['Quantity']
df.sort_values('TotalPrice', ascending=False).head(5)

In [None]:
# Total amount paid per invoice, showing the five largest invoices.
# Sort first and take the head afterwards: calling head() before sorting would
# only reorder the first five invoices instead of surfacing the top ones.
invoice_totals = df.groupby('InvoiceNo').agg({'TotalPrice': 'sum'})
invoice_totals.sort_values(by='TotalPrice', ascending=False).head()

In [None]:
# Latest invoice timestamp present in df; the analysis date is derived from it
# in the next cell.
max_date = df['InvoiceDate'].max()
max_date

In [None]:
# Date of analysis: one day after the most recent invoice, so that even the
# latest purchase gets a recency of at least one day.
snapshot = max_date + pd.Timedelta(days=1)
snapshot

In [None]:
# One group per customer: the basis for the per-customer R, F and M metrics.
# Preview the first two invoice lines of every customer.
customer_group = df.groupby(by="CustomerID")
customer_group.head(n=2)

In [None]:
invoice_dates = customer_group["InvoiceDate"]

# Recency: time elapsed between the analysis date and the customer's latest purchase.
recency = snapshot - invoice_dates.max()

# Tenure: time elapsed between the analysis date and the customer's first purchase.
tenure = snapshot - invoice_dates.min()

# Frequency: number of distinct invoices raised for the customer.
frequency = customer_group["InvoiceNo"].nunique()

# Monetary: total amount the customer has spent.
monetary = customer_group["TotalPrice"].sum()

In [None]:
# Assemble the per-customer RFM(T) table. All four series come from the same
# groupby, so they share the CustomerID index; the two timedeltas are reduced
# to whole days.
rfm = pd.DataFrame(
    {
        "Recency": recency.dt.days,
        "Frequency": frequency,
        "Monetary": monetary,
        "Tenure": tenure.dt.days,
    }
)
rfm.head()

In [None]:
# Summary statistics of recency (in days).
rfm["Recency"].describe()

In [None]:
# Distribution of recency across customers. rfm holds one row per customer, so
# each bar counts customers (not transactions), and the y-axis is labelled
# accordingly. The previously computed but unused mean/std and hist() return
# values have been dropped.
fig, ax = plt.subplots()
ax.hist(rfm["Recency"], bins=400, facecolor='green', alpha=0.75)
ax.set_xlabel('Recency in days')
ax.set_ylabel('Number of customers')
ax.set_title('Histogram of sales recency')
ax.grid(True)
plt.show()

In [None]:
# Rescale every RFMT feature to the range 1-5 so that all features are on a
# common scale before clustering.
feature_cols = ["Recency", "Frequency", "Monetary", "Tenure"]
min_max_scaler = MinMaxScaler(feature_range=(1, 5))
scaled = min_max_scaler.fit_transform(rfm[feature_cols])

# fit_transform returns a bare ndarray, so pass the original index back in;
# otherwise the CustomerID labels are replaced by 0..n-1 and results can no
# longer be traced to customers. Naming the columns explicitly (instead of
# reusing rfm.columns) keeps the cell re-runnable after extra columns such as
# "Segment" have been added to rfm.
rfm = pd.DataFrame(scaled, columns=feature_cols, index=rfm.index)
rfm.head()

In [None]:
# Elbow method: fit KMeans for k = 2..10 and record the inertia of each fit.
X = rfm.values
n_cluster = range(2, 11)
cost = [
    KMeans(n_clusters=k, random_state=13).fit(X).inertia_
    for k in n_cluster
]

# Inertia against the number of segments.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(n_cluster, cost, marker="o")
ax.set_xlabel("Number of Segments")
ax.set_ylabel("Inertia Value")
plt.show()

In [None]:
# k=3 probably gives better-separated clusters, but the business would like to
# see 5 groups.
n_cluster = 5
segment_features = rfm[["Recency", "Frequency", "Monetary", "Tenure"]]

# Fit on the four scaled features and store each customer's cluster label.
model = KMeans(n_clusters=n_cluster, random_state=2023)
rfm["Segment"] = model.fit_predict(segment_features)
rfm.head()

In [None]:
# Profile each segment: mean of every column in rfm per Segment value.
rfm.groupby("Segment").mean()

In [None]:
# Map the numeric cluster labels to business-friendly tier names.
# NOTE(review): KMeans label ids are arbitrary, so this mapping only holds for
# the model fitted above; re-check it against the per-segment means whenever
# the data, the seed or the library version changes.
humanization = {3: "Diamond", 1: "Platinum", 2: "Gold", 4: "Silver", 0: "Bronze"}

# Assign the result back instead of calling replace(inplace=True) on the
# selected column: the in-place form mutates a temporary object and leaves rfm
# unchanged when pandas Copy-on-Write is active.
rfm["Segment"] = rfm["Segment"].replace(humanization)
rfm.head()

In [None]:
# Customers per segment as a horizontal bar chart, sorted by size in ascending order.
segment_sizes = rfm.groupby('Segment').size().sort_values(ascending=True)
segment_sizes.plot(kind='barh', title="Number of Customers", figsize=(5, 2))
plt.show()

In [None]:
# Pairwise plots of the rfm columns, coloured by Segment.
sns.pairplot(rfm, hue="Segment")
plt.show()

###  RFM Score (Clustering Alternative)
Score = (Recency score x Recency weight) + (Frequency score x Frequency weight) + (Monetary score x Monetary weight).

In [None]:
# Weighted RFM score: higher must mean "better customer" on every component.
RECENCY_WEIGHT, FREQUENCY_WEIGHT, MONETARY_WEIGHT = 0.6, 0.2, 0.2

# The scaled Recency column grows with the number of days since the last
# purchase (1 = most recent, 5 = longest ago), i.e. it points the opposite way
# to Frequency and Monetary. Reflect it on the scale so that recent buyers get
# the high recency score.
# SCALE_MIN/SCALE_MAX must match the MinMaxScaler range used for rfm (1-5).
SCALE_MIN, SCALE_MAX = 1, 5
recency_score = (SCALE_MIN + SCALE_MAX) - rfm['Recency']

rfm["Score"] = (
    RECENCY_WEIGHT * recency_score
    + FREQUENCY_WEIGHT * rfm['Frequency']
    + MONETARY_WEIGHT * rfm['Monetary']
)
rfm.head(2)

In [None]:
# Standardise the score to zero mean and unit variance.
std_scaler = StandardScaler()
# fit_transform returns an (n, 1) array; flatten it to a single column of values.
rfm['Std_Score'] = std_scaler.fit_transform(rfm[["Score"]]).ravel()
rfm.head(2)

In [None]:
# Summary statistics of the final rfm table.
rfm.describe()