# RETAIL STORE SALES ANALYSIS - SECTION E
## Advanced Analysis - RFM Customer Segmentation (Question 10)

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import warnings
import os
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Global plot appearance: a matplotlib style sheet, then seaborn's palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")

print("Libraries imported successfully!")

## Load Data

In [None]:
# Resolve the processed sales CSV relative to the current working directory;
# the project root is taken to be the parent of that directory.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
transformed_path = os.path.join(
    project_root, 'data', 'processed', 'transformed_sales_data.csv'
)

# Read the transactions and convert the date column to datetime values.
df = pd.read_csv(transformed_path)
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])

# Short summary of what was loaded.
transaction_dates = df['Transaction Date']
first_day = transaction_dates.min().date()
last_day = transaction_dates.max().date()
customer_total = df['Customer ID'].nunique()

print(f"Data loaded: {len(df)} transactions")
print(f"Date range: {first_day} to {last_day}")
print(f"Unique customers: {customer_total:,}")

# Q10: Customer Segmentation
## Step 1: Calculate RFM Metrics

## Set Reference Date and Calculate RFM Metrics

In [None]:
# Recency is measured from the day after the latest transaction in the data,
# so every customer ends up with a recency of at least one day.
current_date = df['Transaction Date'].max() + pd.Timedelta(days=1)
print(f"Reference date for recency: {current_date.date()}\n")


def _days_since_last_purchase(purchase_dates):
    """Return whole days between a customer's latest purchase and current_date."""
    return (current_date - purchase_dates.max()).days


# One row per customer:
#   Recency   - days since the customer's most recent transaction
#   Frequency - count of the customer's transaction IDs
#   Monetary  - sum of the customer's 'Total Spent'
rfm_df = (
    df.groupby('Customer ID')
    .agg(
        Recency=('Transaction Date', _days_since_last_purchase),
        Frequency=('Transaction ID', 'count'),
        Monetary=('Total Spent', 'sum'),
    )
    .reset_index()
    .rename(columns={'Customer ID': 'Customer_ID'})
)

print(f"RFM data created for {len(rfm_df)} customers")
print("\nFirst 10 customers:")
rfm_df.head(10)

## Explore RFM Distributions

In [None]:
# Summary statistics for the three RFM metrics.
print("RFM STATISTICS")
print("=" * 50)
rfm_summary = rfm_df[['Recency', 'Frequency', 'Monetary']].describe()
print(rfm_summary.round(2))

# One histogram per metric: (column, bar colour, x-axis label).
histogram_specs = [
    ('Recency', 'skyblue', 'Days Since Last Purchase'),
    ('Frequency', 'lightgreen', 'Number of Transactions'),
    ('Monetary', 'salmon', 'Total Spent ($)'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel, (metric, fill_color, x_label) in zip(axes, histogram_specs):
    panel.hist(rfm_df[metric], bins=30, color=fill_color, edgecolor='black', alpha=0.7)
    panel.set_title(f'{metric} Distribution', fontweight='bold')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Number of Customers')

plt.tight_layout()
plt.show()

## Step 2: Standardize RFM Values

## Apply StandardScaler

In [None]:
# Work on a copy of the three metric columns so rfm_df itself is not touched.
rfm_features = rfm_df[['Recency', 'Frequency', 'Monetary']].copy()

# Fit the scaler on the RFM metrics, then transform those same rows.
scaler = StandardScaler().fit(rfm_features)
rfm_scaled = scaler.transform(rfm_features)

print("RFM values standardized")
print(f"Scaled data shape: {rfm_scaled.shape}")
print("\nFirst 5 rows after standardization:")

# Preview of the scaled values with descriptive column names.
scaled_names = [f'{metric}_scaled' for metric in rfm_features.columns]
pd.DataFrame(rfm_scaled[:5], columns=scaled_names).round(3)

## Step 3: Find Optimal Number of Clusters

## Calculate Inertia and Silhouette Score for Different k

In [None]:
# Candidate cluster counts to evaluate (2 through 10 inclusive).
K_range = range(2, 11)

# Fit one K-Means model per candidate k on the standardized RFM matrix.
candidate_models = [
    KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(rfm_scaled)
    for n_clusters in K_range
]

# Collect the two diagnostics, in the same order as K_range.
inertias = [model.inertia_ for model in candidate_models]
silhouette_scores = [
    silhouette_score(rfm_scaled, model.labels_) for model in candidate_models
]

## Plot Elbow Curve and Silhouette Scores

In [None]:
# Side-by-side diagnostics for choosing k: inertia (elbow method) on the left,
# silhouette score on the right. Both are drawn over the values in K_range.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Plot inertia against the K_range it was computed for, rather than padding
# the list with None to fit a hard-coded range(1, 11).
ax1.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
ax1.set_ylabel('Inertia', fontsize=12)
ax1.set_title('Elbow Method for Optimal k', fontweight='bold', fontsize=14)

ax2.plot(K_range, silhouette_scores, 'go-', linewidth=2, markersize=8)
ax2.set_ylabel('Silhouette Score', fontsize=12)
ax2.set_title('Silhouette Score by k', fontweight='bold', fontsize=14)

# Formatting shared by both panels, including the marker at the chosen k.
for panel in (ax1, ax2):
    panel.set_xlabel('Number of Clusters (k)', fontsize=12)
    panel.grid(True, alpha=0.3)
    panel.axvline(x=3, color='red', linestyle='--', alpha=0.7, label='k=3 (suggested)')
    panel.legend()

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
figures_dir = os.path.join(project_root, 'reports', 'figures')
os.makedirs(figures_dir, exist_ok=True)
plt.savefig(os.path.join(figures_dir, 'rfm_elbow_curve.png'), dpi=300, bbox_inches='tight')
plt.show()

print("✅ Elbow curve saved")
# NOTE(review): this conclusion is hard-coded, not derived from the values above.
print("k=3 appears optimal based on the elbow curve and silhouette scores.")

## Step 4: Apply K-Means with k=3

## Perform Clustering

In [None]:
# Final model: three clusters fitted on the standardized RFM matrix; the
# resulting label of each customer is stored alongside their RFM values.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans.fit(rfm_scaled)
rfm_df['Cluster'] = kmeans.labels_

print("Clustering complete!")
print("\nCluster distribution:")

# Customers per cluster, ordered by cluster label.
cluster_counts = rfm_df['Cluster'].value_counts().sort_index()
customer_total = len(rfm_df)
for label, members in cluster_counts.items():
    share = members / customer_total * 100
    print(f"  Cluster {label}: {members} customers ({share:.1f}%)")

## Step 5: Analyze Cluster Characteristics

## Calculate Cluster Statistics

In [None]:
# Summary functions applied to each RFM metric, and the short prefix used for
# each metric in the flattened column names.
summary_funcs = ['mean', 'median', 'min', 'max']
metric_prefixes = {'Recency': 'Recency', 'Frequency': 'Freq', 'Monetary': 'Monetary'}

# Per-cluster summary of every metric plus the number of customers.
agg_spec = {metric: list(summary_funcs) for metric in metric_prefixes}
agg_spec['Customer_ID'] = 'count'
cluster_stats = rfm_df.groupby('Cluster').agg(agg_spec).round(2)

# Flatten the two-level column index into names such as 'Freq_Median'.
cluster_stats.columns = [
    f'{prefix}_{func.capitalize()}'
    for prefix in metric_prefixes.values()
    for func in summary_funcs
] + ['Customer_Count']

cluster_stats['% of Customers'] = (
    cluster_stats['Customer_Count'] / len(rfm_df) * 100
).round(1)

print("CLUSTER CHARACTERISTICS")
print("=" * 80)
print(cluster_stats)

# Total spend per cluster and each cluster's share of the overall total.
cluster_value = rfm_df.groupby('Cluster')['Monetary'].sum().round(2)
overall_value = cluster_value.sum()
print("\nTotal value by cluster:")
for label, spend in cluster_value.items():
    pct = (spend / overall_value) * 100
    print(f"  Cluster {label}: ${spend:,.2f} ({pct:.1f}% of total)")

## 2D Cluster Visualizations

In [None]:
# One colour per cluster label (0, 1, 2); the 3D plot cell reuses this list.
colors = ['red', 'blue', 'green']

fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# The three pairwise scatter panels share all of their drawing logic, so they
# are described as data: (axes, x column, y column, x label, y label, title).
scatter_panels = [
    (axes[0, 0], 'Frequency', 'Monetary',
     'Frequency (Number of Transactions)', 'Monetary (Total Spent $)',
     'Customer Segments: Frequency vs Monetary'),
    (axes[0, 1], 'Recency', 'Monetary',
     'Recency (Days Since Last Purchase)', 'Monetary (Total Spent $)',
     'Customer Segments: Recency vs Monetary'),
    (axes[1, 0], 'Recency', 'Frequency',
     'Recency (Days Since Last Purchase)', 'Frequency (Number of Transactions)',
     'Customer Segments: Recency vs Frequency'),
]

for ax, x_col, y_col, x_label, y_label, title in scatter_panels:
    for cluster, color in enumerate(colors):
        data = rfm_df[rfm_df['Cluster'] == cluster]
        ax.scatter(data[x_col], data[y_col],
                   c=color, label=f'Cluster {cluster}', alpha=0.6, s=50)
    ax.set_xlabel(x_label, fontsize=11)
    ax.set_ylabel(y_label, fontsize=11)
    ax.set_title(title, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Fourth panel: share of customers in each cluster.
ax = axes[1, 1]
cluster_sizes = rfm_df['Cluster'].value_counts().sort_index()
wedges, texts, autotexts = ax.pie(
    cluster_sizes,
    labels=[f'Cluster {i}\n({size} cust)' for i, size in cluster_sizes.items()],
    autopct='%1.1f%%', colors=colors, startangle=90)
ax.set_title('Customer Segment Distribution', fontweight='bold')
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontweight('bold')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
figures_dir = os.path.join(project_root, 'reports', 'figures')
os.makedirs(figures_dir, exist_ok=True)
plt.savefig(os.path.join(figures_dir, 'rfm_clusters_2d.png'), dpi=300, bbox_inches='tight')
plt.show()

print("✅ 2D cluster visualizations saved")

## 3D Visualization of Clusters

In [None]:
# 3D scatter of the raw (unscaled) RFM values, one colour per cluster.
# Relies on the `colors` list defined in the 2D visualization cell.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

for cluster, color in enumerate(colors):
    data = rfm_df[rfm_df['Cluster'] == cluster]
    ax.scatter(data['Recency'], data['Frequency'], data['Monetary'],
               c=color, label=f'Cluster {cluster}', s=50, alpha=0.7)

ax.set_xlabel('Recency (days)', fontsize=11, labelpad=10)
ax.set_ylabel('Frequency', fontsize=11, labelpad=10)
ax.set_zlabel('Monetary ($)', fontsize=11, labelpad=10)
ax.set_title('3D RFM Customer Segments', fontweight='bold', fontsize=14, pad=20)
ax.legend()
# Fixed camera angle so the saved figure is reproducible.
ax.view_init(elev=20, azim=45)

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
figures_dir = os.path.join(project_root, 'reports', 'figures')
os.makedirs(figures_dir, exist_ok=True)
plt.savefig(os.path.join(figures_dir, 'rfm_clusters_3d.png'), dpi=300, bbox_inches='tight')
plt.show()

print("✅ 3D cluster visualization saved")

## Interpret Clusters

In [None]:
# Print a human-readable profile for every cluster found in cluster_stats.
print("CLUSTER INTERPRETATION")
print("=" * 80)

banner = '⭐' * 40
# Revenue across all clusters; computed once instead of once per cluster.
total_revenue = cluster_value.sum()

# Iterate over the clusters actually present rather than a hard-coded range(3).
for cluster in cluster_stats.index:
    print(f"\n{banner}")
    print(f"CLUSTER {cluster}")
    print(f"{banner}")

    stats_row = cluster_stats.loc[cluster]
    avg_rec = stats_row['Recency_Mean']
    avg_freq = stats_row['Freq_Mean']
    avg_mon = stats_row['Monetary_Mean']
    total_val = cluster_value[cluster]
    cust_count = stats_row['Customer_Count']

    # Rule-based labelling on the cluster means.
    # NOTE(review): the thresholds (100/200 days, 5/2 transactions, $500) are
    # fixed constants, not derived from the data - confirm they suit this dataset.
    if avg_rec < 100 and avg_freq > 5 and avg_mon > 500:
        segment = "🌟 VIP CUSTOMERS"
        desc = "High-value, frequent, recent purchasers"
    elif avg_rec > 200 or avg_freq < 2:
        segment = "⚠️ AT-RISK CUSTOMERS"
        desc = "Low engagement, may be lost if not reactivated"
    else:
        segment = "📊 REGULAR CUSTOMERS"
        desc = "Moderate engagement, potential for growth"

    print(f"\n📌 SEGMENT: {segment}")
    print(f"   {desc}")
    print(f"\n📊 STATISTICS:")
    print(f"   • Customer count: {int(cust_count):,} ({stats_row['% of Customers']:.1f}% of total)")
    print(f"   • Total value: ${total_val:,.2f} ({total_val/total_revenue*100:.1f}% of revenue)")
    print(f"   • Avg recency: {avg_rec:.0f} days since last purchase")
    print(f"   • Avg frequency: {avg_freq:.1f} transactions per customer")
    print(f"   • Avg monetary: ${avg_mon:.2f} per customer")

## Save RFM Results

In [None]:
# Persist the per-customer RFM table (with cluster labels) and the per-cluster
# summary under <project_root>/reports, creating the folder if needed.
reports_dir = os.path.join(project_root, 'reports')
os.makedirs(reports_dir, exist_ok=True)

# rfm_df has a plain positional index, so it is omitted; cluster_stats keeps
# its index because that index holds the cluster labels.
rfm_df.to_csv(os.path.join(reports_dir, 'rfm_results.csv'), index=False)
cluster_stats.to_csv(os.path.join(reports_dir, 'cluster_statistics.csv'))

print("✅ RFM results saved to reports/ folder")

# SECTION E SUMMARY

✅ **Q10 Completed: RFM Customer Segmentation**

- RFM Model Built with 3 customer segments
- K-Means clustering applied on standardized data
- Clusters visualized in 2D and 3D
- Segment interpretation completed