# =================================================================
# Consumer Purchasing Behavior Analysis
# =================================================================
# Goal: Segment customers into meaningful behavioral groups using PCA and K-Means
# to support targeted marketing, promotions, and retention strategies.
# Key Tasks:
#   1. Load and clean the marketing campaign dataset.
#   2. Engineer key features (age, tenure, total spending, campaign responses).
#   3. Scale/encode features, reduce dimensionality with PCA.
#   4. Use K-Means clustering with silhouette and elbow methods to choose k.
#   5. Profile and interpret clusters (personas) for business insights.
# Tools: Python, pandas, numpy, matplotlib, seaborn, scikit-learn, joblib
# Dataset: Kaggle (Consumer Behavior dataset)

In [None]:
# ---------- IMPORT LIBRARIES ----------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import math
import os

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from pathlib import Path

In [None]:
# ---------- PROJECT SETUP ----------
# All artifact folders are created directly under the current working directory.
project_dir = Path.cwd()

fig_dir, models_dir, outputs_dir = (
    project_dir / folder_name
    for folder_name in ("figures", "models", "outputs")
)

# exist_ok=True makes re-running this cell harmless.
fig_dir.mkdir(exist_ok=True)
models_dir.mkdir(exist_ok=True)
outputs_dir.mkdir(exist_ok=True)

def save_plot(filename, width=8, height=5, dpi=300):
    """
    Resize the active Matplotlib figure and write it to the figures/ folder.

    The current figure is set to ``width`` x ``height`` inches and saved as
    ``fig_dir / filename`` at ``dpi`` dots per inch with a tight bounding box.
    """
    active_figure = plt.gcf()
    active_figure.set_size_inches(width, height)

    destination = fig_dir / filename
    plt.savefig(destination, dpi=dpi, bbox_inches="tight")

def save_output(df, filename):
    """
    Write a pandas DataFrame to the outputs/ directory as a CSV file.

    The index is not written. The destination path is printed and returned.
    """
    destination = outputs_dir / filename
    df.to_csv(destination, index=False)
    print(f"Saved output: {destination}")
    return destination

# -------------------- PLOT STYLE --------------------
# Apply the seaborn theme first, then override font sizes in one rcParams update.
sns.set(style="whitegrid")
plt.rcParams.update({
    "font.size": 14,
    "axes.labelsize": 14,
    "axes.titlesize": 14,
    "legend.fontsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# ---------- AXIS LABELS WITH UNITS ----------
# Maps a feature column name to the axis label (with units) used in plots.
axis_labels = dict(
    Age="Age (years)",
    Income="Annual Income (currency units)",
    Total_Spent="Total Spending (currency units)",
    Customer_Tenure="Customer Tenure (days since enrollment)",
    Recency="Recency (days since last purchase)",
)

In [None]:
# -------------------- LOAD DATA --------------------
# The input is read as tab-delimited even though the file name ends in .csv.
data_path = project_dir / "consumer_behavior_data.csv"
consumer_behavior_df = pd.read_csv(data_path, sep="\t")

# Quick validation of what was loaded: shape, first rows, column dtypes.
print("Raw shape:", consumer_behavior_df.shape)
print(
    "\nHead (first 5 rows):",
    consumer_behavior_df.head().to_string(index=False),
    sep="\n",
)
print("\nDtypes:", consumer_behavior_df.dtypes, sep="\n")

In [None]:
# ---------- DATA CLEANING & FEATURE ENGINEERING ----------
# Reference values for the age calculation and the age sanity filter, named so
# they can be adjusted in one place.
# NOTE(review): Age is measured against a fixed calendar year, whereas
# Customer_Tenure below is measured against the latest enrollment date found
# in the data — confirm that mixing these two reference points is intended.
REFERENCE_YEAR = 2025
MIN_AGE = 18
MAX_AGE = 90

rows_before_cleaning = len(consumer_behavior_df)

# Parse enrollment dates (day-first); values that cannot be parsed become NaT
consumer_behavior_df["Dt_Customer"] = pd.to_datetime(
    consumer_behavior_df["Dt_Customer"],
    dayfirst=True,
    errors="coerce"
)

# Drop rows with an invalid date or a missing Income in a single pass.
# dropna() leaves the frame unchanged when nothing is missing, so no explicit
# isna() guard is needed.
consumer_behavior_df = consumer_behavior_df.dropna(
    subset=["Dt_Customer", "Income"]
)

# Age
consumer_behavior_df["Age"] = REFERENCE_YEAR - consumer_behavior_df["Year_Birth"]

# Remove unrealistic ages. The explicit copy makes the filtered frame
# independent, so the column assignments below always act on their own data.
consumer_behavior_df = consumer_behavior_df[
    (consumer_behavior_df["Age"] >= MIN_AGE)
    & (consumer_behavior_df["Age"] <= MAX_AGE)
].copy()

# Report how many rows the steps above removed, so coerced dates or filtered
# ages never shrink the dataset silently.
print(
    "Rows removed during cleaning: "
    f"{rows_before_cleaning - len(consumer_behavior_df)}"
)

# Children & Parent flag
consumer_behavior_df["Children"] = (
    consumer_behavior_df["Kidhome"] + consumer_behavior_df["Teenhome"]
)
consumer_behavior_df["IsParent"] = (consumer_behavior_df["Children"] > 0).astype(int)

# Customer tenure: days between each enrollment date and the latest
# enrollment date present in the cleaned data
max_dt = consumer_behavior_df["Dt_Customer"].max()
consumer_behavior_df["Customer_Tenure"] = (
    max_dt - consumer_behavior_df["Dt_Customer"]
).dt.days

# Total amount spent across product categories
mnt_cols = [
    "MntWines", "MntFruits", "MntMeatProducts",
    "MntFishProducts", "MntSweetProducts", "MntGoldProds"
]

consumer_behavior_df["Total_Spent"] = consumer_behavior_df[mnt_cols].sum(axis=1)

# Total accepted campaigns (only campaign columns actually present are summed)
campaign_cols = ["AcceptedCmp1", "AcceptedCmp2", "AcceptedCmp3",
                 "AcceptedCmp4", "AcceptedCmp5"]
existing_campaign_cols = [c for c in campaign_cols if c in consumer_behavior_df.columns]
consumer_behavior_df["Total_Accepted_Campaigns"] = consumer_behavior_df[existing_campaign_cols].sum(axis=1)

print("\nAfter cleaning & feature engineering (sample rows):")
cols_preview = [
    "ID", "Age", "Education", "Marital_Status", "Income",
    "Children", "IsParent", "Customer_Tenure", "Total_Spent",
    "Recency", "Total_Accepted_Campaigns"
]
print(consumer_behavior_df[cols_preview].head().to_string(index=False))

print("\nMissing values by column:")
print(consumer_behavior_df.isnull().sum().to_string())

# ---------- EXPLORATORY DATA ANALYSIS (EDA) ----------
# Grid of histograms, one panel per key continuous feature
eda_features = ["Age", "Income", "Total_Spent", "Customer_Tenure", "Recency"]
n_features = len(eda_features)
n_cols = 3
n_rows = math.ceil(n_features / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows))
axes = axes.flatten()

for ax, col in zip(axes, eda_features):
    ax.hist(consumer_behavior_df[col], bins=30, edgecolor="black")
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(axis_labels.get(col, col))
    ax.set_ylabel("Frequency")

# Panels beyond the last feature are left over from the grid; drop them
for unused_ax in axes[n_features:]:
    fig.delaxes(unused_ax)

plt.suptitle("Key Continuous Feature Distributions", y=1.02)
plt.tight_layout()
save_plot("eda_histograms_key_features.png", width=15, height=4 * n_rows, dpi=300)
plt.show()

# Spending spread per marital status, shown as one box per category
plt.figure(figsize=(8, 6))
sns.boxplot(x="Marital_Status", y="Total_Spent", data=consumer_behavior_df)
plt.title("Total Spending by Marital Status")
plt.xlabel("Marital Status")
plt.ylabel(axis_labels["Total_Spent"])
plt.xticks(rotation=45)
plt.tight_layout()
save_plot("total_spent_by_marital_status.png", width=8, height=6, dpi=300)
plt.show()

In [None]:
# ---------- FEATURE SELECTION FOR CLUSTERING ----------
categorical_features = ["Education", "Marital_Status"]

# Demographic, engagement and aggregate spending columns, followed by the
# per-category spending columns.
numeric_features = [
    "Age", "Income", "Children",
    "Customer_Tenure", "Recency", "Total_Spent",
    "NumDealsPurchases", "NumWebPurchases",
    "NumCatalogPurchases", "NumStorePurchases",
    "NumWebVisitsMonth", "Complain",
    "Total_Accepted_Campaigns",
    *mnt_cols,
]

# Keep only the modeling columns and discard any row with a missing value
consumer_behavior_df_model = consumer_behavior_df[
    [*numeric_features, *categorical_features]
].dropna()

print(f"\nModeling dataframe shape: {consumer_behavior_df_model.shape}")

# Independent copy used as the model input
X = consumer_behavior_df_model.copy()

In [None]:
# ---------- PREPROCESSING PIPELINE ----------
# Numeric columns are robust-scaled; categorical columns are one-hot encoded
# with the first level of each category dropped.
# NOTE(review): with drop="first" and handle_unknown="ignore", a category not
# seen during fit is presumably encoded as all zeros, the same as the dropped
# level — confirm this is acceptable when scoring new customers.
numeric_transformer = RobustScaler()
categorical_transformer = OneHotEncoder(drop="first", handle_unknown="ignore")

# sparse_threshold=0 makes the combined output always a dense array. The
# one-hot encoder emits a sparse matrix, and the default threshold would stack
# the result as sparse whenever overall density is low; the PCA step that
# consumes this output is fed dense data regardless of the data's density.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

# ---------- PCA (DIMENSIONALITY REDUCTION) ----------
RANDOM_STATE = 42

# Fit an unrestricted PCA on the preprocessed matrix to inspect how variance
# accumulates across components.
X_preprocessed = preprocessor.fit_transform(X)
pca_full = PCA(random_state=RANDOM_STATE)
X_pca_full = pca_full.fit_transform(X_preprocessed)

explained_variance_ratio = pca_full.explained_variance_ratio_
cumulative_explained = explained_variance_ratio.cumsum()
component_numbers = np.arange(1, len(explained_variance_ratio) + 1)

plt.figure(figsize=(8, 5))
plt.plot(component_numbers, cumulative_explained, marker="o")
plt.title("PCA – Cumulative Explained Variance")
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance Ratio")
plt.grid(True)
plt.tight_layout()
save_plot("pca_cumulative_explained_variance.png", width=8, height=5, dpi=300)
plt.show()

# Persist the per-component and cumulative variance table
pca_variance_df = pd.DataFrame({
    "component": component_numbers,
    "explained_variance_ratio": explained_variance_ratio,
    "cumulative_explained_variance": cumulative_explained,
})
save_output(pca_variance_df, "pca_explained_variance.csv")

# Fixed number of components used by the clustering steps
N_COMPONENTS = 8
pca = PCA(n_components=N_COMPONENTS, random_state=RANDOM_STATE)

In [None]:
# ---------- K-MEANS MODEL SELECTION (SILHOUETTE & ELBOW) ----------
# For each candidate k, fit K-Means on the PCA-reduced data and record the
# silhouette score and inertia.
k_values = range(2, 9)
sil_scores = []
inertias = []

# Preprocessing and PCA do not depend on k, so they are fitted once here
# instead of being refitted inside the loop. Only the K-Means step varies.
X_pre_k = preprocessor.fit_transform(X)
X_pca_k = PCA(
    n_components=N_COMPONENTS, random_state=RANDOM_STATE
).fit_transform(X_pre_k)

for k in k_values:
    kmeans_k = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
    labels_k = kmeans_k.fit_predict(X_pca_k)

    # Silhouette is evaluated in the same PCA space the clusters were fit in
    sil_scores.append(silhouette_score(X_pca_k, labels_k))
    inertias.append(kmeans_k.inertia_)

def _plot_metric_vs_k(metric_values, title, y_label, filename):
    """Draw one k-selection metric against the candidate k values, save and show it."""
    plt.figure(figsize=(8, 5))
    sns.lineplot(x=list(k_values), y=metric_values, marker="o")
    plt.title(title)
    plt.xlabel("Number of Clusters (k)")
    plt.ylabel(y_label)
    plt.grid(True)
    plt.tight_layout()
    save_plot(filename, width=8, height=5, dpi=300)
    plt.show()


# Silhouette plot
_plot_metric_vs_k(
    sil_scores,
    "Silhouette Score vs Number of Clusters (k)",
    "Silhouette Score",
    "silhouette_score_vs_k.png",
)

# Elbow plot
_plot_metric_vs_k(
    inertias,
    "Elbow Method – Inertia vs Number of Clusters (k)",
    "Inertia (Within-Cluster SSE)",
    "elbow_method_inertia_vs_k.png",
)

# Persist the metrics behind both plots, one row per candidate k
k_selection_df = pd.DataFrame({
    "k": list(k_values),
    "silhouette_score": sil_scores,
    "inertia": inertias,
})
save_output(k_selection_df, "k_selection_metrics.csv")

In [None]:
# ---------- FIT FINAL K-MEANS MODEL ----------
# Final k, chosen from the silhouette and elbow results
BEST_K = 4

final_kmeans = KMeans(n_clusters=BEST_K, random_state=RANDOM_STATE, n_init=10)

# End-to-end pipeline: preprocessing -> PCA -> K-Means
clustering_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("pca", pca),
        ("cluster", final_kmeans),
    ]
)
clustering_pipeline.fit(X)

# Re-project the data through the fitted preprocessing and PCA steps so the
# PCA coordinates are available for visualization
X_preprocessed_final = clustering_pipeline["preprocess"].transform(X)
X_pca_final = clustering_pipeline["pca"].transform(X_preprocessed_final)

# Attach each row's cluster assignment to the modeling frame
cluster_labels = clustering_pipeline["cluster"].labels_
consumer_behavior_df_model["cluster"] = cluster_labels

# ---------- CLUSTER PROFILING ----------
# Columns to average per cluster. numeric_features already lists Total_Spent,
# so appending it again would select the column twice and duplicate it in the
# profile table and CSV. dict.fromkeys() removes repeats, keeping first-seen order.
profile_columns = list(dict.fromkeys(numeric_features + ["Total_Spent"]))

cluster_profile = (
    consumer_behavior_df_model
    .groupby("cluster")[profile_columns]
    .mean()
    .round(2)
)

print("\nCluster Profile (mean values):")
print(cluster_profile.to_string())

# Save cluster profile (cluster id becomes a regular column)
cluster_profile_out = cluster_profile.reset_index()
save_output(cluster_profile_out, "cluster_profile.csv")

# Cluster sizes, ordered by cluster id
cluster_counts = consumer_behavior_df_model["cluster"].value_counts().sort_index()
print("\nCluster sizes (number of customers per cluster):")
print(cluster_counts)

cluster_counts_df = cluster_counts.reset_index()
cluster_counts_df.columns = ["cluster", "count"]
save_output(cluster_counts_df, "cluster_sizes.csv")

# Education distribution by cluster (share of each level within a cluster)
print("\nEducation distribution by cluster (proportion):")
edu_dist = (
    consumer_behavior_df_model.groupby("cluster")["Education"]
    .value_counts(normalize=True)
    .rename("proportion")
)
print(edu_dist.to_string())

edu_dist_df = edu_dist.reset_index()
save_output(edu_dist_df, "education_distribution_by_cluster.csv")

# Marital status distribution by cluster (share of each status within a cluster)
print("\nMarital status distribution by cluster (proportion):")
marital_dist = (
    consumer_behavior_df_model.groupby("cluster")["Marital_Status"]
    .value_counts(normalize=True)
    .rename("proportion")
)
print(marital_dist.to_string())

marital_dist_df = marital_dist.reset_index()
save_output(marital_dist_df, "marital_status_distribution_by_cluster.csv")

In [None]:
# ---------- PCA 2D SCATTERPLOT ----------
# Plot the first two principal components, coloured by assigned cluster.
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    X_pca_final[:, 0],
    X_pca_final[:, 1],
    c=cluster_labels,
    cmap="viridis",
    alpha=0.7
)
# Build a legend from the scatter handle so each colour can be mapped back to
# its cluster id; without it the colour coding cannot be read off the figure.
plt.legend(*scatter.legend_elements(), title="Cluster", loc="best")
plt.title("Customer Segments (PCA 2D Projection)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.grid(True)
plt.tight_layout()
save_plot("pca_clusters_scatter.png", width=8, height=6, dpi=300)
plt.show()

In [None]:
# --- SAVE MODEL ---
# Serialize the fitted pipeline (preprocess -> PCA -> K-Means) into models/
model_path = models_dir / "customer_personality_clustering_pipeline.pkl"
joblib.dump(clustering_pipeline, model_path)
print(f"\nSaved clustering pipeline to: {model_path}")

# -------------------- PIPELINE COMPLETE --------------------
# Closing summary of the stages run, printed one line per stage
completed_steps = [
    "- Project setup (figures/, outputs/, models/ directories)",
    "- Data loading & validation (shape, head preview, dtypes)",
    "- Data cleaning & feature engineering (age, tenure, spending, campaigns)",
    "- Missing-value handling and sanity checks",
    "- Exploratory data analysis (feature distributions, spending by marital status)",
    "- Feature selection for clustering (numeric + categorical)",
    "- Preprocessing pipeline (Robust scaling + one-hot encoding)",
    "- PCA analysis (explained variance + component selection)",
    "- K-Means model selection (silhouette scores + elbow method)",
    f"- Final K-Means clustering fit (k = {BEST_K}) with PCA projection",
    "- Cluster profiling (means, sizes, education & marital distributions)",
    "- Cluster visualization (2D PCA scatter)",
    "- Artifacts saved to /figures, /outputs, and /models",
    f"- Final clustering pipeline saved as {model_path.name}",
]

print("\nPipeline complete:")
for step_line in completed_steps:
    print(step_line)