# In [None]:
# customer_segmentation.py

import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# 1. Load dataset
# Read the transactions workbook into a DataFrame and report how many rows
# were read before any cleaning is applied.
df = pd.read_excel(io="OnlineRetail.xlsx")
print(f"Rows loaded: {len(df.index)}")

# 2. Clean data
# Keep only rows that carry a CustomerID and have a positive Quantity.
# Both conditions are combined into a single mask and the filtered frame is
# copied explicitly, so the column assignments below write to an independent
# DataFrame rather than to a slice of the loaded data (assigning to such a
# slice is what triggers pandas' SettingWithCopyWarning).
keep_rows = df['CustomerID'].notna() & (df['Quantity'] > 0)
df = df.loc[keep_rows].copy()

# Ensure InvoiceDate is a datetime column so date arithmetic works later.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Line-level amount: quantity times unit price.
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# 3. Calculate RFM metrics
# The snapshot date is one day after the latest invoice, so the most recent
# customer has a Recency of at least 1 day.
snapshot_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)

# One row per customer:
#   Recency   - whole days between the snapshot and the customer's last invoice
#   Frequency - number of distinct invoices
#   Monetary  - sum of TotalPrice
rfm = df.groupby('CustomerID').agg(
    Recency=('InvoiceDate', lambda dates: (snapshot_date - dates.max()).days),
    Frequency=('InvoiceNo', 'nunique'),
    Monetary=('TotalPrice', 'sum'),
)

# 4. RFM scoring (1–5)
# Each metric is split into quintiles. Recency labels are reversed so that a
# smaller Recency (a more recent purchase) receives the higher score.
rfm['R_Score'] = pd.qcut(rfm['Recency'], 5, labels=[5, 4, 3, 2, 1])

# Frequency is an integer invoice count, so many customers can share the same
# value. When several quantile edges coincide, pd.qcut raises
# "ValueError: Bin edges must be unique". Ranking with method='first' gives
# every customer a distinct rank (ties broken by row order), so the five bins
# can always be formed.
frequency_rank = rfm['Frequency'].rank(method='first')
rfm['F_Score'] = pd.qcut(frequency_rank, 5, labels=[1, 2, 3, 4, 5])

rfm['M_Score'] = pd.qcut(rfm['Monetary'], 5, labels=[1, 2, 3, 4, 5])

# Concatenate the three digits into a segment code such as "545".
rfm['RFM_Score'] = (
    rfm['R_Score'].astype(str)
    + rfm['F_Score'].astype(str)
    + rfm['M_Score'].astype(str)
)

# 5. Save RFM file
rfm.to_csv("rfm_scores.csv")
print("Saved: rfm_scores.csv")

# 6. K-means clustering (advanced)
# Standardise the three RFM metrics before clustering so that they are on a
# comparable scale for the distance computation.
features = rfm.loc[:, ['Recency', 'Frequency', 'Monetary']]
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)

# Choosing k via elbow method
# Fit K-means for each candidate k in 2..10 and record the inertia of each
# fit; the values are plotted afterwards to pick k visually.
inertia = []
K = range(2, 11)
for k in K:
    # n_init is passed explicitly because its default differs between
    # scikit-learn releases (and omitting it emits a FutureWarning on some);
    # pinning it keeps the results the same regardless of installed version.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(features_scaled)
    inertia.append(km.inertia_)

# Plot inertia against k and save the chart to disk.
# (matplotlib.pyplot is already imported as plt at the top of the file, so
# the redundant mid-file re-import has been dropped.)
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(K, inertia, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method: Optimal k')
fig.savefig("elbow_plot.png")
# Release the figure once it is written so it does not stay open for the
# rest of the run.
plt.close(fig)
print("Saved: elbow_plot.png")

# Fit KMeans with chosen k
chosen_k = 4  # change based on elbow_plot
# n_init is passed explicitly because its default differs between
# scikit-learn releases; pinning it keeps the cluster assignment the same
# regardless of the installed version.
km = KMeans(n_clusters=chosen_k, n_init=10, random_state=42)
# Attach each customer's cluster label to the RFM table and save it.
rfm['Cluster'] = km.fit_predict(features_scaled)
rfm.to_csv("rfm_clusters.csv")
print("Saved: rfm_clusters.csv")

# 7. Visualizations
# Bar chart: number of customers in each RFM segment code, sorted by code.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    x='RFM_Score',
    data=rfm,
    order=sorted(rfm['RFM_Score'].unique()),
    ax=ax,
)
ax.tick_params(axis='x', rotation=90)
ax.set_title('Customer Count by RFM Segment')
# Re-fit the layout so the rotated tick labels and axis label stay inside
# the saved image.
fig.tight_layout()
fig.savefig("rfm_segment_counts.png")
# Release the figure once it is written so it does not stay open.
plt.close(fig)

# Scatter plot: Recency against Monetary, coloured by K-means cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x='Recency',
    y='Monetary',
    hue='Cluster',
    data=rfm,
    palette='tab10',
    ax=ax,
)
ax.set_title('Clusters: Recency vs Monetary')
fig.tight_layout()
fig.savefig("cluster_scatter.png")
plt.close(fig)

print("Visuals saved: rfm_segment_counts.png, cluster_scatter.png")
