# RFM Analysis - Online Retail Dataset (UCI)

**Dataset**: Online Retail Data Set from UCI Machine Learning Repository

**Source**: https://archive.ics.uci.edu/ml/datasets/online+retail or Kaggle

**Description**: Transactional data from a UK-based online retail company (2010-2011)

**Complexity**: Medium

## Dataset Information
- **Records**: ~540,000 rows (transaction line items; a single invoice can span several rows)
- **Customers**: ~4,300 unique customers
- **Time Period**: Dec 2010 - Dec 2011
- **Attributes**: InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, so pandas/matplotlib deprecation and chained-assignment warnings
# raised by later cells will not be shown.
warnings.filterwarnings('ignore')

# Plot defaults used by every figure below: seaborn white grid and a
# 12x6 inch default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 1. Load and Explore Data

In [None]:
# Read the raw workbook into `df`.
# The file can be downloaded from either of:
#   https://www.kaggle.com/datasets/jihyeseo/online-retail-data-set-from-uci-ml-repo
#   https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx
workbook_path = 'Online Retail.xlsx'
worksheet_name = 'Online Retail'
df = pd.read_excel(workbook_path, sheet_name=worksheet_name)
# Alternative if CSV:
# df = pd.read_csv('online_retail.csv', encoding='ISO-8859-1')

# Quick look at size, schema and the first rows
print(f"Dataset shape: {df.shape}")
print("\nColumn names and types:")
print(df.dtypes)
print("\nFirst few rows:")
df.head()

In [None]:
# Count nulls per column, then report what share of rows has no CustomerID
null_counts = df.isnull().sum()
print("Missing values:")
print(null_counts)

missing_customer_pct = null_counts['CustomerID'] / len(df) * 100
print(f"\nPercentage of missing CustomerID: {missing_customer_pct:.2f}%")

In [None]:
# Headline figures for the raw (uncleaned) frame
summary_lines = [
    f"Total Transactions: {len(df):,}",
    f"Unique Customers: {df['CustomerID'].nunique():,}",
    f"Unique Products: {df['StockCode'].nunique():,}",
    f"Date Range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}",
    f"Countries: {df['Country'].nunique()}",
]

print("Dataset Summary:")
for summary_line in summary_lines:
    print(summary_line)

## 2. Data Cleaning and Preprocessing

In [None]:
# Work on a copy so the raw frame `df` is left untouched
df_clean = df.copy()

# Rows without a CustomerID cannot be attributed to any customer
df_clean = df_clean.dropna(subset=['CustomerID'])
print(f"After removing missing CustomerID: {len(df_clean):,} rows")

# Cancelled orders are the ones whose InvoiceNo starts with 'C'
is_cancellation = df_clean['InvoiceNo'].astype(str).str.startswith('C')
df_clean = df_clean[~is_cancellation]
print(f"After removing cancellations: {len(df_clean):,} rows")

# Keep only rows whose quantity and unit price are both strictly positive
has_positive_quantity = df_clean['Quantity'] > 0
has_positive_price = df_clean['UnitPrice'] > 0
df_clean = df_clean[has_positive_quantity & has_positive_price]
print(f"After removing invalid quantities/prices: {len(df_clean):,} rows")

# Revenue of each row: units sold times price per unit
df_clean['TotalAmount'] = df_clean['Quantity'] * df_clean['UnitPrice']

# Optional outlier trim: drop rows above the 99.9th percentile of TotalAmount
amount_threshold = df_clean['TotalAmount'].quantile(0.999)
within_threshold = df_clean['TotalAmount'] <= amount_threshold
df_clean = df_clean[within_threshold]
print(f"After removing outliers: {len(df_clean):,} rows")

remaining_rows = len(df_clean)
remaining_customers = df_clean['CustomerID'].nunique()
print(f"\nFinal dataset: {remaining_rows:,} rows, {remaining_customers:,} customers")

In [None]:
# Cast CustomerID to integer (rows with a missing id were dropped during cleaning)
df_clean = df_clean.astype({'CustomerID': int})

# Make sure InvoiceDate is a datetime column before doing date arithmetic on it
df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'])

# Summary statistics of the cleaned frame as a data-quality check
df_clean.describe()

## 3. RFM Calculation

In [None]:
# Reference point for recency: one day after the most recent transaction
analysis_date = df_clean['InvoiceDate'].max() + timedelta(days=1)
print(f"Analysis Date: {analysis_date}")

# One row per customer:
#   Recency   - whole days between the customer's last invoice and analysis_date
#   Frequency - number of distinct invoices
#   Monetary  - sum of TotalAmount
rfm = (
    df_clean.groupby('CustomerID')
    .agg(
        Recency=('InvoiceDate', lambda invoice_dates: (analysis_date - invoice_dates.max()).days),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('TotalAmount', 'sum'),
    )
    .reset_index()
)

print(f"RFM Table Shape: {rfm.shape}")
rfm.head(10)

In [None]:
# Descriptive statistics for the three raw RFM metrics
rfm_metric_columns = ['Recency', 'Frequency', 'Monetary']
rfm_metric_stats = rfm[rfm_metric_columns].describe()

print("RFM Metrics Summary:")
print(rfm_metric_stats)

## 4. RFM Scoring (Quintile-based)

In [None]:
# Create RFM scores (1-5) using quintiles
def _quintile_score(values, labels):
    """Bin ``values`` into five quantile buckets and return integer scores.

    Args:
        values: numeric pandas Series to bin.
        labels: five score labels, ordered from the lowest bucket to the highest.

    Returns:
        Integer Series of scores aligned with ``values``.

    The values are binned directly when their quintile edges are distinct.
    When ties collapse two edges into one, ``pd.qcut`` cannot produce five
    buckets for the five labels and raises ``ValueError``; in that case the
    values are ranked with ``method='first'`` (ties broken by row order) so
    that five buckets always exist.
    """
    try:
        binned = pd.qcut(values, q=5, labels=labels)
    except ValueError:
        binned = pd.qcut(values.rank(method='first'), q=5, labels=labels)
    return binned.astype(int)


# Lower recency is better, so the labels are reversed for R
rfm['R_Score'] = _quintile_score(rfm['Recency'], [5, 4, 3, 2, 1])
# Frequency is heavily tied (many customers share the same invoice count),
# so it is always scored on ranks
rfm['F_Score'] = _quintile_score(rfm['Frequency'].rank(method='first'), [1, 2, 3, 4, 5])
rfm['M_Score'] = _quintile_score(rfm['Monetary'], [1, 2, 3, 4, 5])

# Combined RFM code: the three digits concatenated as a string, e.g. '545'
rfm['RFM_Score'] = rfm['R_Score'].astype(str) + rfm['F_Score'].astype(str) + rfm['M_Score'].astype(str)

# Total score: plain sum of the three scores
rfm['RFM_Total'] = rfm['R_Score'] + rfm['F_Score'] + rfm['M_Score']

rfm.head(10)

## 5. Customer Segmentation

In [None]:
def _assign_segment(r, f, m):
    """Return the segment label for one customer's R, F and M scores.

    Rules are evaluated top to bottom and the first match wins, so the
    narrower rules must come before the broader ones that would otherwise
    absorb them.
    """
    if r >= 4 and f >= 4 and m >= 4:
        return 'Champions'
    if r >= 3 and f >= 4:
        return 'Loyal Customers'
    if r >= 4 and 2 <= f <= 3:
        return 'Potential Loyalists'
    if r >= 4 and f <= 2:
        return 'New Customers'
    if 3 <= r <= 4 and f <= 2:
        return 'Promising'
    if r >= 3 and f >= 3 and m >= 3:
        return 'Need Attention'
    # Lapsed high-value customers are tested before the broad
    # 'About to Sleep' rule (which matches every r == 2 customer), and the
    # r <= 1 rule is tested before the r <= 2 rule that contains it.
    # In the previous order "Can't Lose Them" could never be returned and
    # 'At Risk' never matched r == 2.
    if r <= 1 and f >= 4 and m >= 4:
        return "Can't Lose Them"
    if r <= 2 and f >= 4 and m >= 4:
        return 'At Risk'
    if 2 <= r <= 3:
        return 'About to Sleep'
    if r <= 2 and f <= 2:
        return 'Hibernating'
    return 'Lost'


def segment_customers(df):
    """
    Segment customers based on RFM scores

    Args:
        df: DataFrame with 'R_Score', 'F_Score' and 'M_Score' columns.

    Returns:
        List of segment labels, one per row of ``df`` in row order
        (empty list for an empty frame).
    """
    # Iterating the three columns in lockstep avoids building a Series per
    # row, which is what iterrows() does.
    score_rows = zip(df['R_Score'], df['F_Score'], df['M_Score'])
    return [_assign_segment(r, f, m) for r, f, m in score_rows]

# Label every customer, then tabulate how many fall into each segment
rfm['Segment'] = segment_customers(rfm)
segment_counts = rfm['Segment'].value_counts().sort_values(ascending=False)
customer_total = len(rfm)

print("Customer Segment Distribution:")
print(segment_counts)
print(f"\nTotal Customers: {customer_total:,}")

## 6. Visualizations

In [None]:
# Histograms of the three raw RFM metrics, each with a dashed line at the mean
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (metric column, bar colour, x-axis label, legend format for the mean)
histogram_panels = [
    ('Recency', 'skyblue', 'Recency (days)', 'Mean: {:.0f}'),
    ('Frequency', 'lightgreen', 'Frequency (# of purchases)', 'Mean: {:.1f}'),
    ('Monetary', 'salmon', 'Monetary (£)', 'Mean: £{:.0f}'),
]

for panel_ax, (metric, bar_color, x_label, mean_format) in zip(axes, histogram_panels):
    metric_mean = rfm[metric].mean()
    panel_ax.hist(rfm[metric], bins=50, color=bar_color, edgecolor='black')
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_ylabel('Number of Customers', fontsize=12)
    panel_ax.set_title(f'{metric} Distribution', fontsize=14, fontweight='bold')
    panel_ax.axvline(metric_mean, color='red', linestyle='--', label=mean_format.format(metric_mean))
    panel_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Segment sizes shown two ways: horizontal bars (counts) and a pie (shares)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes

# Left panel: customers per segment
segment_counts.plot(kind='barh', ax=bar_ax, color='steelblue')
bar_ax.set_xlabel('Number of Customers', fontsize=12)
bar_ax.set_ylabel('Segment', fontsize=12)
bar_ax.set_title('Customer Distribution by Segment', fontsize=14, fontweight='bold')

# Write each count just to the right of its bar
for bar_position, customer_count in enumerate(segment_counts.values):
    bar_ax.text(customer_count + 10, bar_position, str(customer_count), va='center')

# Right panel: percentage split, one Set3 colour per segment
colors = plt.cm.Set3(range(len(segment_counts)))
pie_ax.pie(
    segment_counts.values,
    labels=segment_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
)
pie_ax.set_title('Segment Percentage Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Bar charts showing how many customers received each 1-5 score
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (score column, x-axis label, metric name used in the title, bar colour)
score_panels = [
    ('R_Score', 'R Score', 'Recency', 'skyblue'),
    ('F_Score', 'F Score', 'Frequency', 'lightgreen'),
    ('M_Score', 'M Score', 'Monetary', 'salmon'),
]

for panel_ax, (score_column, x_label, metric_name, bar_color) in zip(axes, score_panels):
    customers_per_score = rfm[score_column].value_counts().sort_index()
    customers_per_score.plot(kind='bar', ax=panel_ax, color=bar_color)
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_ylabel('Count', fontsize=12)
    panel_ax.set_title(f'{metric_name} Score Distribution', fontsize=14, fontweight='bold')
    # Keep the score tick labels horizontal
    panel_ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots of the raw RFM metrics
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

axis_labels = {
    'Recency': 'Recency (days)',
    'Frequency': 'Frequency',
    'Monetary': 'Monetary (£)',
}
# (x metric, y metric, point colour)
scatter_panels = [
    ('Recency', 'Frequency', 'steelblue'),
    ('Recency', 'Monetary', 'darkgreen'),
    ('Frequency', 'Monetary', 'darkred'),
]

for panel_ax, (x_metric, y_metric, point_color) in zip(axes, scatter_panels):
    panel_ax.scatter(rfm[x_metric], rfm[y_metric], alpha=0.3, c=point_color)
    panel_ax.set_xlabel(axis_labels[x_metric], fontsize=12)
    panel_ax.set_ylabel(axis_labels[y_metric], fontsize=12)
    panel_ax.set_title(f'{x_metric} vs {y_metric}', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# 3D scatter of customers in Recency/Frequency/Monetary space, coloured by segment
# Import kept from the original cell; older matplotlib releases need it so
# that projection='3d' is recognised.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

# One colour per segment. tab10 has only 10 distinct colours while the
# segmentation rules define 11 labels, so switch to tab20 when more than 10
# segments are present instead of letting two segments share a colour.
segments_unique = rfm['Segment'].unique()
palette = plt.cm.tab10 if len(segments_unique) <= 10 else plt.cm.tab20
colors_map = {seg: palette(i) for i, seg in enumerate(segments_unique)}

# Draw each segment separately so it gets its own legend entry; without a
# legend the colours cannot be mapped back to segments.
for seg in segments_unique:
    seg_rows = rfm[rfm['Segment'] == seg]
    ax.scatter(seg_rows['Recency'], seg_rows['Frequency'], seg_rows['Monetary'],
               color=colors_map[seg], alpha=0.6, s=50, label=seg)

ax.set_xlabel('Recency (days)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_zlabel('Monetary (£)', fontsize=12)
ax.set_title('3D RFM Visualization', fontsize=14, fontweight='bold')
# Place the legend outside the axes so it does not cover the points
ax.legend(title='Segment', loc='upper left', bbox_to_anchor=(1.05, 1.0))

plt.show()

## 7. Segment Analysis

In [None]:
# Per-segment summary: mean/median of each metric, total revenue and head count,
# rounded to 2 decimals and ordered by total revenue (largest first)
segment_summary = (
    rfm.groupby('Segment')
    .agg(
        Recency_mean=('Recency', 'mean'),
        Recency_median=('Recency', 'median'),
        Frequency_mean=('Frequency', 'mean'),
        Frequency_median=('Frequency', 'median'),
        Monetary_mean=('Monetary', 'mean'),
        Monetary_median=('Monetary', 'median'),
        Monetary_sum=('Monetary', 'sum'),
        Customer_Count=('CustomerID', 'count'),
    )
    .round(2)
    .sort_values('Monetary_sum', ascending=False)
)

print("Segment Summary Statistics:")
segment_summary

In [None]:
# Revenue per segment, in absolute terms and as a share of total revenue
segment_revenue = rfm.groupby('Segment')['Monetary'].sum().sort_values(ascending=False)
segment_revenue_pct = (segment_revenue / segment_revenue.sum() * 100).round(2)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (series, bar colour, x-axis label, title, label x-offset, label format)
revenue_panels = [
    (segment_revenue, 'darkgreen', 'Total Revenue (£)',
     'Revenue by Customer Segment', 5000, '£{:,.0f}'),
    (segment_revenue_pct, 'steelblue', 'Revenue %',
     'Revenue Contribution by Segment (%)', 0.5, '{:.1f}%'),
]

for panel_ax, (series, bar_color, x_label, title, label_offset, label_format) in zip(axes, revenue_panels):
    series.plot(kind='barh', ax=panel_ax, color=bar_color)
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_ylabel('Segment', fontsize=12)
    panel_ax.set_title(title, fontsize=14, fontweight='bold')

    # Write each value just past the end of its bar
    for bar_position, bar_value in enumerate(series.values):
        panel_ax.text(bar_value + label_offset, bar_position,
                      label_format.format(bar_value), va='center')

plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the mean Recency/Frequency/Monetary per segment
# (metrics on the rows, segments on the columns)
segment_avg = rfm.groupby('Segment')[['Recency', 'Frequency', 'Monetary']].mean()

plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    segment_avg.transpose(),
    annot=True,
    fmt='.1f',
    cmap='RdYlGn_r',
    cbar_kws={'label': 'Value'},
)
heatmap_ax.set_title('Average RFM Metrics by Segment (Heatmap)', fontsize=14, fontweight='bold')
heatmap_ax.set_xlabel('Segment', fontsize=12)
heatmap_ax.set_ylabel('Metric', fontsize=12)
plt.tight_layout()
plt.show()

## 8. Top Customers Analysis

In [None]:
# The 20 customers with the highest total spend, with their RFM metrics and segment
top_customer_columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary', 'Segment']
top_customers = rfm[top_customer_columns].nlargest(20, 'Monetary')

print("Top 20 Customers by Monetary Value:")
top_customers

In [None]:
# Size and revenue contribution of the 'Champions' segment
champions = rfm[rfm['Segment'] == 'Champions']
champion_revenue = champions['Monetary'].sum()
champion_revenue_pct = champion_revenue / rfm['Monetary'].sum() * 100

print(f"Champions: {len(champions)} customers")
print(f"Average Monetary Value: £{champions['Monetary'].mean():,.2f}")
print(f"Total Revenue from Champions: £{champion_revenue:,.2f}")
print(f"Revenue % from Champions: {champion_revenue_pct:.2f}%")

## 9. Business Recommendations

In [None]:
# Text report: overall figures, the high-value and at-risk groups, then one
# entry per segment
def _pct(part, whole):
    """Return ``part`` expressed as a percentage of ``whole``."""
    return part / whole * 100


# Overall figures
total_customers = len(rfm)
total_revenue = rfm['Monetary'].sum()
avg_customer_value = rfm['Monetary'].mean()

# Customers in the segments treated as being at risk of churning
at_risk_segments = ['At Risk', "Can't Lose Them", 'About to Sleep']
at_risk = rfm[rfm['Segment'].isin(at_risk_segments)]
at_risk_count = len(at_risk)
at_risk_revenue = at_risk['Monetary'].sum()

# Customers in the two best segments
high_value_segments = ['Champions', 'Loyal Customers']
high_value = rfm[rfm['Segment'].isin(high_value_segments)]
high_value_count = len(high_value)
high_value_revenue = high_value['Monetary'].sum()

banner = "=" * 60
print(banner)
print("BUSINESS INSIGHTS & RECOMMENDATIONS")
print(banner)

print("\n1. CUSTOMER BASE OVERVIEW")
print(f"   - Total Customers: {total_customers:,}")
print(f"   - Total Revenue: £{total_revenue:,.2f}")
print(f"   - Average Customer Value: £{avg_customer_value:,.2f}")

print("\n2. HIGH-VALUE CUSTOMERS (Champions + Loyal)")
print(f"   - Count: {high_value_count:,} ({_pct(high_value_count, total_customers):.1f}%)")
print(f"   - Revenue: £{high_value_revenue:,.2f} ({_pct(high_value_revenue, total_revenue):.1f}%)")
print("   - ACTION: Reward programs, exclusive offers, VIP treatment")

print("\n3. AT-RISK CUSTOMERS")
print(f"   - Count: {at_risk_count:,} ({_pct(at_risk_count, total_customers):.1f}%)")
print(f"   - Revenue at Risk: £{at_risk_revenue:,.2f} ({_pct(at_risk_revenue, total_revenue):.1f}%)")
print("   - ACTION: Re-engagement campaigns, win-back offers, personalized outreach")

print("\n4. SEGMENT-SPECIFIC STRATEGIES")
# Segments are listed in order of first appearance in `rfm`
for segment in rfm['Segment'].unique():
    seg_data = rfm[rfm['Segment'] == segment]
    seg_count = len(seg_data)
    seg_revenue = seg_data['Monetary'].sum()
    print(f"\n   {segment}:")
    print(f"   - Customers: {seg_count:,} ({_pct(seg_count, total_customers):.1f}%)")
    print(f"   - Revenue: £{seg_revenue:,.2f} ({_pct(seg_revenue, total_revenue):.1f}%)")
    print(f"   - Avg Value: £{seg_data['Monetary'].mean():,.2f}")

print("\n" + banner)

## 10. Export Results

In [None]:
# Write the per-customer RFM table to CSV (row index omitted)
rfm_results_path = 'online_retail_rfm_results.csv'
rfm.to_csv(rfm_results_path, index=False)
print(f"RFM results exported to: {rfm_results_path}")

# Write the per-segment summary to CSV (index kept: it holds the segment names)
segment_summary_path = 'online_retail_segment_summary.csv'
segment_summary.to_csv(segment_summary_path)
print(f"Segment summary exported to: {segment_summary_path}")

## 11. Additional Analysis (Optional)

In [None]:
# Pairwise correlation between the three raw RFM metrics, plotted and printed
correlation = rfm[['Recency', 'Frequency', 'Monetary']].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.title('RFM Metrics Correlation', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("Correlation Matrix:")
print(correlation)

In [None]:
# Rough customer lifetime value (CLV) heuristic:
#   Estimated_CLV = Monetary * (Frequency / (Recency + 1)) * lifespan in days
# The +1 in the denominator keeps it non-zero; the lifespan is assumed to be
# 3 years of 365 days.
lifespan_years = 3
days_per_year = 365

purchases_per_day = rfm['Frequency'] / (rfm['Recency'] + 1)
rfm['Estimated_CLV'] = rfm['Monetary'] * purchases_per_day * lifespan_years * days_per_year

clv_columns = ['CustomerID', 'Segment', 'Recency', 'Frequency', 'Monetary', 'Estimated_CLV']
top_clv = rfm.nlargest(10, 'Estimated_CLV')[clv_columns]
print("Top 10 Customers by Estimated CLV:")
top_clv