In [1174]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [1175]:
data_raw = pd.read_csv("marketing_campaign.csv", sep="\t")
data = data_raw.copy()

In [1176]:
data = data.drop(columns=["ID", "Z_CostContact", "Z_Revenue"])
data['Income'] = data['Income'].fillna(data['Income'].median())

data["Dt_Customer"] = pd.to_datetime(data["Dt_Customer"], dayfirst=True)
data['Age'] = data['Dt_Customer'].max().year - data['Year_Birth']

data["Family_Size"] = data["Kidhome"] + data["Teenhome"]
data['Has_Children'] = (data['Family_Size'] > 0).astype(int)

#defining bins and labels for age
age_bins = [17, 25, 45, 65, 100]  # cut-off points
age_labels = [
    "Young Impulsive/Luxury (18–25)",
    "Budget-Conscious (26–45)",
    "Affluent Mid-Lifers (46–65)",
    "Senior Premium Buyers (65+)"
]
# Apply binning
data['AgeGroup'] = pd.cut(data['Age'], bins=age_bins, labels=age_labels)
le_age = LabelEncoder()
data['AgeGroup'] = le_age.fit_transform(data['AgeGroup'])

#binary encode
data['Is_In_Relationship'] = data['Marital_Status'].isin(['Married', 'Together']).astype(int)
data['Is_Single'] = data['Marital_Status'].isin(['Single','Divorced','Widow','Alone','Absurd','YOLO']).astype(int)

spend_features = ["MntWines","MntFruits","MntMeatProducts","MntFishProducts","MntSweetProducts","MntGoldProds"]
data["TotalSpend"] = data[spend_features].sum(axis=1)

data['Effective_Campaigns'] = (data['AcceptedCmp1'] + data['AcceptedCmp2'] + data['AcceptedCmp3'] + data['AcceptedCmp4'] + data['AcceptedCmp5'])

purchase_features = ["NumDealsPurchases","NumWebPurchases","NumCatalogPurchases","NumStorePurchases"]
data["TotalPurchases"] = data[purchase_features].sum(axis=1)

# binning education in 3 categories based on their expenditure and accepted campaigns
data['Education'] = data['Education'].replace(['PhD', 'Master', "Graduation"], "Graduated")
#Label encoding education
le_education = LabelEncoder()
data['Education_Encoded'] = le_education.fit_transform(data['Education'])

In [1177]:
X = data[["AgeGroup", "Education_Encoded"]].copy()
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=4, init="k-means++", random_state=42)
labels = kmeans.fit_predict(scaled_data)

print("Silhouette Score (KMeans):", silhouette_score(scaled_data, labels))

Silhouette Score (KMeans): 0.9150078823831548


In [1178]:
feature_groups = {
    "Demographics": ["AgeGroup", "Education_Encoded"],
    "Behavior": ["TotalSpend", "TotalPurchases", "Complain", "Response"],
    "Engagement": ["Effective_Campaigns", "Recency", "Complain"]
}

cluster_results = {}
for name, features in feature_groups.items():
    X = data[features]
    X_scaled = StandardScaler().fit_transform(X)

    kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)
    data[f"{name}_Cluster"] = labels

    sil_score = silhouette_score(X_scaled, labels)

    profile = data.groupby(f"{name}_Cluster")[features].mean().round(2)
    edu_dist = (data.groupby(f"{name}_Cluster")["Education"]
                   .value_counts(normalize=True)
                   .unstack(fill_value=0).round(2))
    marital_dist = (data.groupby(f"{name}_Cluster")["Marital_Status"]
                      .value_counts(normalize=True)
                      .unstack(fill_value=0).round(2))

    cluster_results[name] = {
        "silhouette": sil_score,
        "profile": profile,
        "edu_dist": edu_dist,
        "marital_dist": marital_dist
    }

for view, results in cluster_results.items():
    print(f"\n=== {view.upper()} CLUSTERS ===")
    print("Silhouette:", round(results["silhouette"], 3))
    # print("\nProfile:\n", results["profile"])
    # print("\nEducation Distribution:\n", results["edu_dist"])
    # print("\nMarital Distribution:\n", results["marital_dist"])



=== DEMOGRAPHICS CLUSTERS ===
Silhouette: 0.915

=== BEHAVIOR CLUSTERS ===
Silhouette: 0.589

=== ENGAGEMENT CLUSTERS ===
Silhouette: 0.517


In [1179]:
def create_robust_segment_assignment(data, save_encoders=True):
    # rule-based segments with balanced distribution

    income_high = data['Income'].quantile(0.65)      # Top 35% income
    income_mid = data['Income'].quantile(0.35)       # Above 35th percentile
    spend_high = data['TotalSpend'].quantile(0.65)   # Top 35% spending
    spend_mid = data['TotalSpend'].quantile(0.35)    # Above 35th percentile
    recency_active = data['Recency'].quantile(0.5)   # Bottom 50% (more recent)
    campaigns_active = data['Effective_Campaigns'].quantile(0.5)  # Above median
    purchases_active = data['TotalPurchases'].quantile(0.4)  # Above 40th percentile
    purchases_high = data['TotalPurchases'].quantile(0.7)  # Frequent buyers

    thresholds = {
        'income_high': income_high,
        'income_mid': income_mid,
        'spend_high': spend_high,
        'spend_mid': spend_mid,
        'recency_active': recency_active,
        'campaigns_active': campaigns_active,
        'purchases_active': purchases_active,
        'purchases_high': purchases_high
    }

    # Create reverse mapping for education as le_encoded can encode differently each time
    education_reverse_map = dict(zip(le_education.transform(le_education.classes_),
                                   le_education.classes_))

    if save_encoders:
        joblib.dump(le_education, 'education_encoder.pkl')
        joblib.dump(thresholds, 'segment_thresholds.pkl')
        joblib.dump(education_reverse_map, 'education_reverse_map.pkl')

    def assign_segment(row):
        # Premium Spenders: High income AND (High spending OR high engagement)
        if (row['TotalSpend'] > spend_high and (row['TotalPurchases'] > purchases_high or
             (row['Effective_Campaigns'] > campaigns_active or row['Recency'] <= recency_active))):
            return "Premium Spenders"

        # Value Seekers: Moderate+ income AND campaign responsive AND active
        elif ((row['Effective_Campaigns'] > 0 or row['Response'] == 1) and
              row['Recency'] <= recency_active):
            return "Value Seekers"

        # Budget-Conscious: Active customers (recent purchases) but not high spenders
        elif (row['Recency'] <= recency_active and
              row['TotalPurchases'] >= purchases_active and
              row['TotalSpend'] < spend_high):
            return "Budget-Conscious"

        # Dormant/Unengaged: Inactive customers (restrictive criteria)
        elif (row['Recency'] > data['Recency'].quantile(0.8) and  # Top 20% recency (very inactive)
              row['TotalPurchases'] < purchases_active and
              row['Effective_Campaigns'] == 0):
            return "Dormant/Unengaged"

        # Catch remaining customers - assign based on dominant characteristic
        else:
            if row['TotalSpend'] >= spend_mid:
                return "Budget-Conscious"  # Moderate spenders
            else:
                return "Value Seekers"  # Others who show some engagement
    return assign_segment, thresholds, education_reverse_map

segment_function, thresholds, edu_reverse_map = create_robust_segment_assignment(data)
data['Rule_Based_Segment'] = data.apply(segment_function, axis=1)

'''
FOR ANALYSIS OF SEGMENTATION & ITS DISTRIBUITON ONLY
Remove from final .pickl file
 '''

print(" RULE-BASED SEGMENT DISTRIBUTION:")
print("=" * 45)
segment_counts = data['Rule_Based_Segment'].value_counts()
for segment, count in segment_counts.items():
    percentage = (count / len(data)) * 100
    print(f"{segment}: {count} customers ({percentage:.1f}%)")

print("\n THRESHOLDS:")
print("=" * 25)
print(f"High Income: ${thresholds['income_high']:,.0f} (65th percentile)")
print(f"Mid Income: ${thresholds['income_mid']:,.0f} (35th percentile)")
print(f"High Spend: ${thresholds['spend_high']:,.0f} (65th percentile)")
print(f"Active Recency: {thresholds['recency_active']:.0f} days (median)")

# Check new distribution across views
print("\n VIEW-SPECIFIC ANALYSIS:")
print("=" * 40)

for view in ['Demographics', 'Behavior', 'Engagement']:
    if f'{view}_Cluster' in data.columns:
        print(f"\n{view.upper()} VIEW:")
        view_analysis = pd.crosstab(data[f'{view}_Cluster'], data['Rule_Based_Segment'], normalize='index') * 100
        print(view_analysis.round(1))

        # Show cluster mappings
        print(f"{view} Cluster Mappings:")
        for cluster in data[f'{view}_Cluster'].unique():
            cluster_data = data[data[f'{view}_Cluster'] == cluster]['Rule_Based_Segment']
            most_common = cluster_data.mode()[0]
            percentage = (cluster_data == most_common).mean() * 100
            print(f"  Cluster {cluster} -> {most_common} ({percentage:.1f}%)")

 RULE-BASED SEGMENT DISTRIBUTION:
Value Seekers: 711 customers (31.7%)
Budget-Conscious: 687 customers (30.7%)
Premium Spenders: 683 customers (30.5%)
Dormant/Unengaged: 159 customers (7.1%)

 THRESHOLDS:
High Income: $61,851 (65th percentile)
Mid Income: $41,653 (35th percentile)
High Spend: $790 (65th percentile)
Active Recency: 49 days (median)

 VIEW-SPECIFIC ANALYSIS:

DEMOGRAPHICS VIEW:
Rule_Based_Segment    Budget-Conscious  Dormant/Unengaged  Premium Spenders  \
Demographics_Cluster                                                          
0                                 21.4               12.4              23.1   
1                                 28.1                7.0              28.2   
2                                 36.7                6.3              32.6   
3                                 27.2                4.4              44.3   

Rule_Based_Segment    Value Seekers  
Demographics_Cluster                 
0                              43.2  
1              