# RFM Analysis - Online Retail II Dataset

**Dataset**: Online Retail II from UCI Machine Learning Repository

**Source**: https://archive.ics.uci.edu/ml/datasets/Online+Retail+II

**Description**: Updated version of the Online Retail dataset, extended to cover transactions from December 2009 to December 2011

**Complexity**: Medium (Advanced with CLV focus)

## Focus Areas
- Standard RFM Analysis
- **Customer Lifetime Value (CLV) Prediction**
- Cohort Analysis
- Churn Prediction Indicators

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats
import warnings
# Silence all library warnings so the notebook output stays readable.
warnings.filterwarnings(action='ignore')

# Plot defaults shared by every figure in this notebook.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

## 1. Load Data

In [None]:
# Download from UCI or use the Excel files
# Two sheets: Year 2009-2010 and Year 2010-2011

# Option 1: Load both sheets with a single read_excel call.
# Passing a list to `sheet_name` returns a {sheet name: DataFrame} dict, so the
# (large) workbook is opened once instead of once per sheet.
sheet_names = ['Year 2009-2010', 'Year 2010-2011']
sheets = pd.read_excel('online_retail_II.xlsx', sheet_name=sheet_names)
df1, df2 = (sheets[name] for name in sheet_names)
df = pd.concat([df1, df2], ignore_index=True)

# Option 2: If already combined CSV
# df = pd.read_csv('online_retail_II.csv', encoding='ISO-8859-1')

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

## 2. Data Preprocessing

In [None]:
# Standardize column names (Online Retail II might have different names)
column_mapping = {
    'Invoice': 'InvoiceNo',
    'StockCode': 'StockCode',
    'Description': 'Description',
    'Quantity': 'Quantity',
    'InvoiceDate': 'InvoiceDate',
    'Price': 'UnitPrice',
    'Customer ID': 'CustomerID',
    'Country': 'Country'
}

df = df.rename(columns=column_mapping)

# Data cleaning.
# Build the row filter once and take an explicit copy: the column assignments
# below must write to an independent frame, not to a slice of `df` (which
# triggers SettingWithCopyWarning and may silently write to a temporary).
has_customer = df['CustomerID'].notna()                              # drop rows without a customer
is_cancellation = df['InvoiceNo'].astype(str).str.startswith('C')    # cancelled invoices start with 'C'
is_valid_line = (df['Quantity'] > 0) & (df['UnitPrice'] > 0)         # drop invalid quantities and prices
df_clean = df[has_customer & ~is_cancellation & is_valid_line].copy()

# Calculate total amount per invoice line
df_clean['TotalAmount'] = df_clean['Quantity'] * df_clean['UnitPrice']

# Remove extreme outliers (line totals above the 99.9th percentile)
outlier_cutoff = df_clean['TotalAmount'].quantile(0.999)
df_clean = df_clean[df_clean['TotalAmount'] <= outlier_cutoff].copy()

# Convert dates and make CustomerID an integer key
df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'])
df_clean['CustomerID'] = df_clean['CustomerID'].astype(int)

# Extract time features
df_clean['Year'] = df_clean['InvoiceDate'].dt.year
df_clean['Month'] = df_clean['InvoiceDate'].dt.month
df_clean['YearMonth'] = df_clean['InvoiceDate'].dt.to_period('M')

print(f"Clean dataset: {df_clean.shape}")
print(f"Customers: {df_clean['CustomerID'].nunique():,}")
print(f"Date range: {df_clean['InvoiceDate'].min()} to {df_clean['InvoiceDate'].max()}")

## 3. RFM Calculation

In [None]:
# Set analysis date: one day after the last transaction, so Recency >= 1
analysis_date = df_clean['InvoiceDate'].max() + timedelta(days=1)
print(f"Analysis date: {analysis_date}")

# Calculate RFM
#   Recency   - days between the customer's last purchase and the analysis date
#   Frequency - number of distinct invoices
#   Monetary  - total spend
rfm = df_clean.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (analysis_date - x.max()).days,
    'InvoiceNo': 'nunique',
    'TotalAmount': 'sum'
}).reset_index()

rfm.columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary']

# Additional features for CLV
# Calculate per-line spend statistics and the purchase interval
customer_stats = df_clean.groupby('CustomerID').agg({
    'TotalAmount': ['sum', 'mean', 'std'],
    'InvoiceDate': ['min', 'max', 'count'],
    'InvoiceNo': ['nunique']
})

# Flatten the (column, statistic) MultiIndex into names like 'TotalAmount_mean'
customer_stats.columns = ['_'.join(col).strip() for col in customer_stats.columns.values]
customer_stats['customer_lifespan_days'] = (customer_stats['InvoiceDate_max'] - customer_stats['InvoiceDate_min']).dt.days

# A purchase is an invoice, not an invoice line: `InvoiceDate_count` counts rows
# (one per product on an invoice), which would shrink the interval by the basket
# size. Use the number of distinct invoices to count the gaps between purchases.
purchase_gaps = customer_stats['InvoiceNo_nunique'] - 1
customer_stats['avg_days_between_purchases'] = (
    customer_stats['customer_lifespan_days'] / purchase_gaps
)
# Single-purchase customers give 0/0 = NaN (or x/0 = inf); both mean
# "no interval observable" and are stored as 0.
customer_stats['avg_days_between_purchases'] = (
    customer_stats['avg_days_between_purchases']
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

rfm = rfm.merge(customer_stats[['TotalAmount_mean', 'TotalAmount_std', 'customer_lifespan_days', 'avg_days_between_purchases']],
                left_on='CustomerID', right_index=True)

rfm.head(10)

## 4. RFM Scoring

In [None]:
# Quintile-based scoring: each dimension is cut into 5 quantile bins.
ascending_labels = [1, 2, 3, 4, 5]
descending_labels = ascending_labels[::-1]

# Recency: a smaller value is better, so the lowest quintile gets label 5.
recency_bins = pd.qcut(rfm['Recency'], q=5, labels=descending_labels, duplicates='drop')
rfm['R_Score'] = recency_bins.astype(int)

# Frequency: rank first (ties broken by order of appearance) so that the many
# repeated values still produce five distinct bin edges.
frequency_rank = rfm['Frequency'].rank(method='first')
frequency_bins = pd.qcut(frequency_rank, q=5, labels=ascending_labels, duplicates='drop')
rfm['F_Score'] = frequency_bins.astype(int)

# Monetary: a larger value is better.
monetary_bins = pd.qcut(rfm['Monetary'], q=5, labels=ascending_labels, duplicates='drop')
rfm['M_Score'] = monetary_bins.astype(int)

# Combined views: a 3-character code such as '545' and the plain sum of scores.
rfm['RFM_Score'] = rfm['R_Score'].astype(str).str.cat(
    [rfm['F_Score'].astype(str), rfm['M_Score'].astype(str)]
)
rfm['RFM_Total'] = rfm['R_Score'].add(rfm['F_Score']).add(rfm['M_Score'])

# Segmentation
def _classify_segment(r, f, m):
    """Map one customer's R, F and M scores to a segment name.

    Rules are evaluated top to bottom and the first match wins, so the more
    specific rule must always precede the more general one.
    """
    if r >= 4 and f >= 4 and m >= 4:
        return 'Champions'
    if r >= 3 and f >= 4:
        return 'Loyal Customers'
    if r >= 4 and 2 <= f <= 3:
        return 'Potential Loyalists'
    if r >= 4 and f <= 2:
        return 'New Customers'
    if 3 <= r <= 4 and f <= 2:
        return 'Promising'
    if r >= 3 and f >= 3 and m >= 3:
        return 'Need Attention'
    # Lapsed high-value customers are checked before the generic
    # 'About to Sleep' rule, and the stricter r <= 1 test before r <= 2;
    # otherwise "Can't Lose Them" can never be reached.
    if r <= 1 and f >= 4 and m >= 4:
        return "Can't Lose Them"
    if r <= 2 and f >= 4 and m >= 4:
        return 'At Risk'
    if 2 <= r <= 3:
        return 'About to Sleep'
    if r <= 2 and f <= 2:
        return 'Hibernating'
    return 'Lost'


def segment_customers(df):
    """Return a list with one segment name per row of ``df``.

    Args:
        df: DataFrame with ``R_Score``, ``F_Score`` and ``M_Score`` columns.

    Returns:
        List of segment names in the same order as the rows of ``df``.
    """
    # Zipping the three columns avoids building a Series per row (iterrows).
    return [
        _classify_segment(r, f, m)
        for r, f, m in zip(df['R_Score'], df['F_Score'], df['M_Score'])
    ]

# Label every customer, then show how many fall into each segment.
rfm['Segment'] = segment_customers(rfm)
print("Segment Distribution:", rfm['Segment'].value_counts(), sep="\n")

## 5. Customer Lifetime Value (CLV) Analysis

In [None]:
# Historical CLV (actual)
rfm['Historical_CLV'] = rfm['Monetary']

# Predicted CLV - Multiple approaches

# Average order value = spend per invoice. `TotalAmount_mean` is the mean of
# individual invoice *lines*, so multiplying it by a purchase (invoice) count
# understates value by roughly the basket size. Frequency is the number of
# distinct invoices, so Monetary / Frequency is the true order value.
avg_order_value = rfm['Monetary'] / rfm['Frequency']

# Approach 1: Simple extrapolation
# CLV = Average Order Value × Purchase Frequency × Customer Lifespan
avg_customer_lifespan_years = 2  # Assumption
rfm['Simple_CLV'] = (avg_order_value * rfm['Frequency'] *
                     (365 * avg_customer_lifespan_years / (rfm['Recency'] + 1)))

# Approach 2: Time-based extrapolation
# Extrapolate based on customer's historical behavior
# (+1 day keeps the rate finite for customers whose purchases fall on one day)
rfm['Purchase_Rate'] = rfm['Frequency'] / (rfm['customer_lifespan_days'] + 1)
rfm['Expected_Purchases_Per_Year'] = rfm['Purchase_Rate'] * 365
rfm['Time_Based_CLV'] = (avg_order_value *
                         rfm['Expected_Purchases_Per_Year'] *
                         avg_customer_lifespan_years)

# Approach 3: Weighted CLV considering recency
# Recent customers get higher weight: the multiplier runs from 0.5 to just under 1.0
max_recency = rfm['Recency'].max()
rfm['Recency_Weight'] = 1 - (rfm['Recency'] / max_recency)
rfm['Weighted_CLV'] = rfm['Time_Based_CLV'] * (0.5 + 0.5 * rfm['Recency_Weight'])

# Approach 4: BG/NBD-inspired probability
# Probability customer is "alive": exponential decay of recency measured in
# units of the customer's own purchase interval. Customers with no usable
# interval (0, NaN or inf) fall back to the population's mean recency, so the
# result is never NaN and never divides by zero.
typical_gap_days = (
    rfm['avg_days_between_purchases']
    .replace([np.inf, -np.inf, 0], np.nan)
    .fillna(rfm['Recency'].mean())
)
rfm['Prob_Alive'] = np.exp(-rfm['Recency'] / typical_gap_days)
rfm['Probabilistic_CLV'] = rfm['Time_Based_CLV'] * rfm['Prob_Alive']

print("CLV Summary Statistics:")
print(rfm[['Historical_CLV', 'Simple_CLV', 'Time_Based_CLV', 'Weighted_CLV', 'Probabilistic_CLV']].describe())

In [None]:
# Compare CLV approaches: one histogram per method on a 2x2 grid.
clv_columns = ['Simple_CLV', 'Time_Based_CLV', 'Weighted_CLV', 'Probabilistic_CLV']
titles = ['Simple CLV', 'Time-Based CLV', 'Weighted CLV', 'Probabilistic CLV']

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# axes.ravel() walks the grid row by row, matching the order of clv_columns.
for ax, col, title in zip(axes.ravel(), clv_columns, titles):
    # Keep values up to the 95th percentile so the long tail does not flatten the plot.
    cutoff = rfm[col].quantile(0.95)
    data = rfm.loc[rfm[col] <= cutoff, col]
    mean_value = data.mean()
    median_value = data.median()

    ax.hist(data, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
    ax.axvline(mean_value, color='red', linestyle='--', linewidth=2,
               label=f'Mean: £{mean_value:.2f}')
    ax.axvline(median_value, color='green', linestyle='--', linewidth=2,
               label=f'Median: £{median_value:.2f}')

    ax.set_title(f'{title} Distribution', fontsize=13, fontweight='bold')
    ax.set_xlabel('CLV (£)', fontsize=11)
    ax.set_ylabel('Number of Customers', fontsize=11)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Mean of every CLV measure per segment, rounded to 2 decimals.
clv_metric_columns = [
    'Historical_CLV',
    'Simple_CLV',
    'Time_Based_CLV',
    'Weighted_CLV',
    'Probabilistic_CLV',
]
clv_by_segment = rfm.groupby('Segment')[clv_metric_columns].agg('mean').round(2)

print("Average CLV by Segment:")
clv_by_segment

In [None]:
# Visualize CLV by segment: two panels side by side.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
comparison_ax, champions_ax = axes[0], axes[1]

# Left panel: historical spend against the probabilistic prediction.
clv_comparison = (
    clv_by_segment[['Historical_CLV', 'Probabilistic_CLV']]
    .sort_values('Historical_CLV', ascending=False)
)
clv_comparison.plot(kind='barh', ax=comparison_ax, color=['steelblue', 'coral'])
comparison_ax.set_title('Historical vs Predicted CLV by Segment', fontsize=14, fontweight='bold')
comparison_ax.set_xlabel('Average CLV (£)', fontsize=12)
comparison_ax.legend(['Historical CLV', 'Predicted CLV'])

# Right panel: every CLV method, averaged over the Champions segment.
is_champion = rfm['Segment'] == 'Champions'
champions_clv = rfm.loc[is_champion, clv_columns].mean()
champions_clv.plot(kind='bar', ax=champions_ax, color='darkgreen')
champions_ax.set_title('Champions: CLV by Different Methods', fontsize=14, fontweight='bold')
champions_ax.set_xlabel('CLV Method', fontsize=12)
champions_ax.set_ylabel('Average CLV (£)', fontsize=12)
champions_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 6. Cohort Analysis

In [None]:
# Determine customer cohort (first purchase month)
customer_cohort = df_clean.groupby('CustomerID')['InvoiceDate'].min().reset_index()
customer_cohort.columns = ['CustomerID', 'CohortDate']
customer_cohort['CohortMonth'] = customer_cohort['CohortDate'].dt.to_period('M')

# Merge with transaction data
df_cohort = df_clean.merge(customer_cohort[['CustomerID', 'CohortMonth']], on='CustomerID')

# Calculate cohort index (months since first purchase).
# Subtracting two Period columns yields one offset object per row, which then
# needs a Python-level apply; plain year/month arithmetic gives the same month
# count fully vectorised.
year_diff = df_cohort['YearMonth'].dt.year - df_cohort['CohortMonth'].dt.year
month_diff = df_cohort['YearMonth'].dt.month - df_cohort['CohortMonth'].dt.month
df_cohort['CohortIndex'] = year_diff * 12 + month_diff

# Cohort analysis: retention (distinct active customers per cohort and month offset)
cohort_data = df_cohort.groupby(['CohortMonth', 'CohortIndex'])['CustomerID'].nunique().reset_index()
cohort_pivot = cohort_data.pivot(index='CohortMonth', columns='CohortIndex', values='CustomerID')

# Calculate retention rates: column 0 is the cohort's size in its first month
cohort_size = cohort_pivot.iloc[:, 0]
retention_matrix = cohort_pivot.divide(cohort_size, axis=0) * 100

print("Cohort Retention Matrix (first 6 months):")
print(retention_matrix.iloc[:, :7].round(2))

In [None]:
# Heatmap of the retention percentages for the first 12 cohort-index columns.
plt.figure(figsize=(14, 8))

first_year_retention = retention_matrix.iloc[:, :12]
heatmap_ax = sns.heatmap(
    first_year_retention,
    annot=True,
    fmt='.0f',
    cmap='YlGnBu',
    cbar_kws={'label': 'Retention %'},
)
heatmap_ax.set_title('Cohort Retention Analysis (First 12 Months)', fontsize=14, fontweight='bold')
heatmap_ax.set_xlabel('Cohort Index (Months Since First Purchase)', fontsize=12)
heatmap_ax.set_ylabel('Cohort Month', fontsize=12)

plt.tight_layout()
plt.show()

## 7. Churn Risk Analysis

In [None]:
# Calculate churn indicators
# NOTE(review): these two averages are not used in this cell; kept in case
# another cell reads them.
avg_recency = rfm['Recency'].mean()
avg_frequency = rfm['Frequency'].mean()

# Churn risk score (0-10)
# Higher score = higher churn risk
# Each component is scaled to [0, 1] by its column maximum, then weighted.
rfm['Churn_Risk_Score'] = (
    (rfm['Recency'] / rfm['Recency'].max() * 4) +  # Recency impact (40%)
    ((1 - rfm['Frequency'] / rfm['Frequency'].max()) * 3) +  # Frequency impact (30%)
    ((1 - rfm['Monetary'] / rfm['Monetary'].max()) * 2) +  # Monetary impact (20%)
    ((1 - rfm['Prob_Alive']) * 1)  # Probability alive (10%)
)

# Normalize to 0-10 (the riskiest customer scores exactly 10)
rfm['Churn_Risk_Score'] = (rfm['Churn_Risk_Score'] / rfm['Churn_Risk_Score'].max() * 10).round(2)

# Categorize churn risk.
# Bins are right-closed, so without include_lowest a score that rounds to
# exactly 0.00 would fall outside (0, 3] and get no category at all.
rfm['Churn_Risk_Category'] = pd.cut(rfm['Churn_Risk_Score'],
                                     bins=[0, 3, 6, 10],
                                     labels=['Low Risk', 'Medium Risk', 'High Risk'],
                                     include_lowest=True)

print("Churn Risk Distribution:")
print(rfm['Churn_Risk_Category'].value_counts())

print("\nChurn Risk by Segment:")
print(rfm.groupby('Segment')['Churn_Risk_Score'].mean().sort_values(ascending=False).round(2))

In [None]:
# Visualize churn risk
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Churn risk distribution.
# value_counts() orders categories by count, so the bars must be put back in
# Low / Medium / High order before the green / orange / red colours are applied;
# otherwise the largest category is always drawn green.
risk_order = ['Low Risk', 'Medium Risk', 'High Risk']
risk_counts = rfm['Churn_Risk_Category'].value_counts().reindex(risk_order, fill_value=0)
risk_counts.plot(kind='bar', ax=axes[0], color=['green', 'orange', 'red'])
axes[0].set_xlabel('Churn Risk Category', fontsize=12)
axes[0].set_ylabel('Number of Customers', fontsize=12)
axes[0].set_title('Churn Risk Distribution', fontsize=14, fontweight='bold')
axes[0].tick_params(axis='x', rotation=0)

# Churn risk by segment, coloured with the same 3 / 6 thresholds as the categories
churn_by_segment = rfm.groupby('Segment')['Churn_Risk_Score'].mean().sort_values(ascending=False)
colors_risk = ['red' if x > 6 else 'orange' if x > 3 else 'green' for x in churn_by_segment.values]
churn_by_segment.plot(kind='barh', ax=axes[1], color=colors_risk)
axes[1].set_xlabel('Average Churn Risk Score', fontsize=12)
axes[1].set_title('Average Churn Risk by Segment', fontsize=14, fontweight='bold')
axes[1].axvline(6, color='red', linestyle='--', alpha=0.5, label='High Risk Threshold')
axes[1].axvline(3, color='orange', linestyle='--', alpha=0.5, label='Medium Risk Threshold')
axes[1].legend()

plt.tight_layout()
plt.show()

## 8. High-Value At-Risk Customers

In [None]:
# Customers in the top quartile of predicted CLV whose churn score is 6 or more.
high_value_threshold = rfm['Probabilistic_CLV'].quantile(0.75)
high_risk_threshold = 6

is_high_value = rfm['Probabilistic_CLV'].ge(high_value_threshold)
is_high_risk = rfm['Churn_Risk_Score'].ge(high_risk_threshold)
high_value_at_risk = (
    rfm.loc[is_high_value & is_high_risk]
    .sort_values('Probabilistic_CLV', ascending=False)
)

clv_at_risk = high_value_at_risk['Probabilistic_CLV']
print(f"High-Value At-Risk Customers: {len(high_value_at_risk)}")
print(f"Total CLV at Risk: £{clv_at_risk.sum():,.2f}")
print(f"Average CLV at Risk: £{clv_at_risk.mean():,.2f}")

print("\nTop 20 High-Value At-Risk Customers:")
summary_columns = ['CustomerID', 'Segment', 'Recency', 'Frequency', 'Monetary',
                   'Probabilistic_CLV', 'Churn_Risk_Score']
high_value_at_risk[summary_columns].head(20)

In [None]:
# Scatter plot of predicted CLV against churn risk, one colour per segment.
plt.figure(figsize=(12, 8))

# Give each segment its own colour, sampled evenly from the tab10 colormap.
segments = rfm['Segment'].unique()
colors = plt.cm.tab10(np.linspace(0, 1, len(segments)))
segment_colors = dict(zip(segments, colors))

for segment in segments:
    seg_data = rfm.loc[rfm['Segment'] == segment]
    plt.scatter(
        seg_data['Churn_Risk_Score'],
        seg_data['Probabilistic_CLV'],
        s=50,
        c=[segment_colors[segment]],
        alpha=0.6,
        label=segment,
    )

# Threshold lines: the region above and to the right is the high-value at-risk zone.
plt.axhline(high_value_threshold, color='red', linestyle='--', alpha=0.5, label='High CLV Threshold')
plt.axvline(high_risk_threshold, color='orange', linestyle='--', alpha=0.5, label='High Risk Threshold')

# Label the danger zone near the top of the y-range.
annotation_y = rfm['Probabilistic_CLV'].max() * 0.9
annotation_box = dict(boxstyle='round', facecolor='yellow', alpha=0.3)
plt.text(7.5, annotation_y, 'HIGH VALUE\nAT RISK',
         fontsize=14, fontweight='bold', color='darkred', alpha=0.7,
         bbox=annotation_box)

plt.title('Customer Lifetime Value vs Churn Risk', fontsize=14, fontweight='bold')
plt.xlabel('Churn Risk Score', fontsize=12)
plt.ylabel('Predicted CLV (£)', fontsize=12)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 9. Strategic Recommendations

In [None]:
# Portfolio-level figures used throughout the report below.
total_customers = len(rfm)
total_clv = rfm['Probabilistic_CLV'].sum()
avg_clv = rfm['Probabilistic_CLV'].mean()

# Customers in the 'High Risk' churn category and the CLV they represent.
is_high_risk_category = rfm['Churn_Risk_Category'] == 'High Risk'
high_risk_customers = int(is_high_risk_category.sum())
high_risk_clv = rfm.loc[is_high_risk_category, 'Probabilistic_CLV'].sum()

champions = rfm.loc[rfm['Segment'] == 'Champions']
champions_clv = champions['Probabilistic_CLV'].sum()

banner = "=" * 70
print(banner)
print("STRATEGIC BUSINESS RECOMMENDATIONS - CLV FOCUSED")
print(banner)

print("\n1. OVERALL PORTFOLIO VALUE")
print(f"   Total Customers: {total_customers:,}")
print(f"   Total Predicted CLV: £{total_clv:,.2f}")
print(f"   Average CLV per Customer: £{avg_clv:,.2f}")

champion_count = len(champions)
champion_customer_share = champion_count / total_customers * 100
champion_clv_share = champions_clv / total_clv * 100
champion_avg_clv = champions['Probabilistic_CLV'].mean()
print("\n2. CHAMPIONS (Highest Value)")
print(f"   Count: {champion_count:,} ({champion_customer_share:.1f}%)")
print(f"   Total CLV: £{champions_clv:,.2f} ({champion_clv_share:.1f}%)")
print(f"   Average CLV: £{champion_avg_clv:,.2f}")
print("   → ACTION: VIP program, early access to products, dedicated support")

at_risk_clv = high_value_at_risk['Probabilistic_CLV'].sum()
top_100_at_risk_clv = high_value_at_risk.head(100)['Probabilistic_CLV'].sum()
print("\n3. HIGH-VALUE AT-RISK CUSTOMERS (Priority Intervention)")
print(f"   Count: {len(high_value_at_risk):,}")
print(f"   CLV at Risk: £{at_risk_clv:,.2f}")
print("   → ACTION: Immediate re-engagement campaign, personalized offers")
print(f"   → PRIORITY: Top 100 customers = £{top_100_at_risk_clv:,.2f} potential loss")

high_risk_customer_share = high_risk_customers / total_customers * 100
high_risk_clv_share = high_risk_clv / total_clv * 100
print("\n4. CHURN RISK OVERVIEW")
print(f"   High Risk: {high_risk_customers:,} customers ({high_risk_customer_share:.1f}%)")
print(f"   CLV at Risk: £{high_risk_clv:,.2f} ({high_risk_clv_share:.1f}%)")
print("   → ACTION: Win-back campaigns, identify common churn patterns")

print("\n5. RETENTION ECONOMICS")
# Assume 5% of the high-risk CLV can be recovered; budget 30% of that value.
retention_improvement = 0.05
recoverable_value = high_risk_clv * retention_improvement
retention_budget = recoverable_value * 0.3
print(f"   If retention improves by 5%: +£{recoverable_value:,.2f} in CLV")
print(f"   ROI Target: Spend up to £{retention_budget:,.2f} on retention (30% of value)")

print("\n6. SEGMENT-SPECIFIC CLV STRATEGIES")
# sort=False keeps segments in order of first appearance in rfm.
for segment, seg_data in rfm.groupby('Segment', sort=False):
    seg_clv = seg_data['Probabilistic_CLV'].sum()
    seg_avg_clv = seg_data['Probabilistic_CLV'].mean()
    seg_churn = seg_data['Churn_Risk_Score'].mean()
    seg_clv_share = seg_clv / total_clv * 100
    print(f"\n   {segment}:")
    print(f"   - Total CLV: £{seg_clv:,.2f} ({seg_clv_share:.1f}%)")
    print(f"   - Avg CLV: £{seg_avg_clv:,.2f} | Avg Churn Risk: {seg_churn:.2f}")

print("\n" + banner)

## 10. Export Results

In [None]:
# Write each result table to CSV and confirm it on stdout.
# Tuple layout: (frame, output path, write the index?, confirmation message).
# Only the cohort matrix keeps its index, because the cohort month lives there.
exports = [
    (rfm, 'online_retail_ii_rfm_clv_analysis.csv', False, "Full RFM + CLV analysis exported"),
    (high_value_at_risk, 'high_value_at_risk_customers.csv', False, "High-value at-risk customers exported"),
    (retention_matrix, 'cohort_retention_matrix.csv', True, "Cohort retention matrix exported"),
]

for frame, path, keep_index, message in exports:
    frame.to_csv(path, index=keep_index)
    print(message)