# 🛍️ Customer Segmentation using K-Means Clustering

This notebook demonstrates how to perform customer segmentation using K-Means clustering on the Mall Customer Segmentation dataset.

## 🎯 Objectives
- Segment customers based on Annual Income and Spending Score
- Use the Elbow method (together with silhouette analysis) to find the optimal number of clusters
- Visualize customer segments
- Generate business insights for targeted marketing

## 📊 Dataset Overview
The Mall Customer Segmentation Data contains information about customers including:
- CustomerID: Unique identifier
- Gender: Male/Female
- Age: Customer age
- Annual Income (k$): Customer's annual income in thousands
- Spending Score (1-100): Score assigned based on customer behavior and spending nature

In [None]:
   "source": [
    "# Import required libraries
",
    "import pandas as pd
",
    "import numpy as np
",
    "import matplotlib.pyplot as plt
",
    "import seaborn as sns
",
    "from sklearn.cluster import KMeans
",
    "from sklearn.preprocessing import StandardScaler
",
    "from sklearn.metrics import silhouette_score
",
    "from mpl_toolkits.mplot3d import Axes3D
",
    "import os
",
    "import warnings
",
    "warnings.filterwarnings('ignore')
",
    "
",
    "# Set style for better plots
",
    "plt.style.use('seaborn-v0_8')
",
    "sns.set_palette("husl")
",
    "
",
    "print("✅ Libraries imported successfully!")"

In [None]:
# Auto-save setup - recreate the results directory and the shared results store.
# The analysis is deterministic, so output from a previous run is discarded.
import os
import shutil
from datetime import datetime

results_dir = "analysis_results"
if os.path.exists(results_dir):
    shutil.rmtree(results_dir)
os.makedirs(results_dir)

# Shared accumulator filled in by the later cells. It must exist before they
# run: they assign into results[...] and append to results['visualizations'],
# which would otherwise fail with a NameError/KeyError.
results = {
    'dataset_info': {},
    'visualizations': [],
    'performance_metrics': {},
    'cluster_analysis': [],
    'business_insights': [],
}

print(f"📁 Results will be saved to: {results_dir}")
print(f"💡 Since analysis is deterministic, previous results will be overwritten")
print("=" * 70)

## 📥 Load and Explore Data

In [None]:
# Load the dataset (with auto-generation if needed)
def load_or_create_dataset(file_path='Mall_Customers.csv'):
    """Return the customer table, synthesising and saving one if the CSV is absent.

    Args:
        file_path: Location of the CSV. If it exists it is read and returned
            unchanged; otherwise a 200-row sample table is generated, written
            to this path, and returned.

    Returns:
        A DataFrame with the columns CustomerID, Gender, Age,
        Annual Income (k$) and Spending Score (1-100).

    Note:
        Generation reseeds NumPy's global random state with 42, so repeated
        generation yields the same table.
    """
    if os.path.exists(file_path):
        print(f"✅ Loading existing dataset: {file_path}")
        return pd.read_csv(file_path)

    print("📥 Dataset not found. Creating sample dataset...")

    np.random.seed(42)
    n_customers = 200

    # The draws below must stay in this order: they share one seeded stream.
    gender_col = np.random.choice(['Male', 'Female'], n_customers, p=[0.45, 0.55])
    age_col = np.clip(np.random.normal(35, 12, n_customers), 18, 70).astype(int)

    # Income is a noisy base plus up to 20k that grows linearly with age.
    raw_income = np.random.normal(60, 25, n_customers) + (age_col - 18) / 52 * 20
    income_col = np.clip(raw_income, 15, 140).astype(int)

    def _spending_band(income):
        """Return (probability of the first range, first range, second range)."""
        if income < 40:
            return 0.7, (1, 40), (60, 100)
        if income < 80:
            return 0.5, (30, 70), (10, 90)
        return 0.4, (10, 50), (60, 100)

    spending_col = []
    for income in income_col:
        threshold, primary, secondary = _spending_band(income)
        # One uniform draw picks the range, a second draw picks the score in it.
        low, high = primary if np.random.random() < threshold else secondary
        spending_col.append(int(np.random.uniform(low, high)))

    data = pd.DataFrame({
        'CustomerID': range(1, n_customers + 1),
        'Gender': gender_col,
        'Age': age_col,
        'Annual Income (k$)': income_col,
        'Spending Score (1-100)': spending_col
    })

    data.to_csv(file_path, index=False)
    print(f"✅ Sample dataset created and saved as '{file_path}'")
    return data

data = load_or_create_dataset()

# Record a snapshot of the raw table in the shared results store.
summary_stats = data.describe()
missing_per_column = data.isnull().sum()
results['dataset_info'] = {
    'shape': data.shape,
    'columns': list(data.columns),
    'basic_stats': summary_stats.to_dict(),
    'missing_values': missing_per_column.to_dict(),
    'data_types': data.dtypes.to_dict()
}

# Persist the same snapshot as a plain-text report, assembled in one write.
dataset_report = "".join([
    "DATASET INFORMATION\n",
    "=" * 50 + "\n\n",
    f"Shape: {data.shape}\n",
    f"Columns: {list(data.columns)}\n\n",
    "Statistical Summary:\n",
    summary_stats.to_string(),
    "\n\nMissing Values:\n",
    missing_per_column.to_string(),
])
with open(f"{results_dir}/dataset_info.txt", "w") as f:
    f.write(dataset_report)

print(f"Dataset shape: {data.shape}")
print(f"Columns: {list(data.columns)}")
print("\nFirst 5 rows:")
# Last expression of the cell: displayed by the notebook.
data.head()

In [None]:
# Basic information about the dataset
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
data.info()
print("\nStatistical Summary:")
# Last expression of the cell: displayed by the notebook.
data.describe()

In [None]:
# Report which columns, if any, contain nulls.
print("Missing values:")
missing_values = data.isnull().sum()
if missing_values.sum() != 0:
    # Show only the columns that actually have gaps, with their counts.
    columns_with_gaps = missing_values[missing_values > 0]
    print(columns_with_gaps)
else:
    print("✅ No missing values found!")

## 📊 Data Visualization and Exploration

In [None]:
# Six-panel overview of the raw data: three histograms on the top row, then a
# gender pie chart and two scatter plots on the bottom row.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Customer Data Exploration', fontsize=16, fontweight='bold')

# Top row: (column, panel title, bar colour). The column name doubles as the
# x-axis label.
histogram_specs = [
    ('Age', 'Age Distribution', 'skyblue'),
    ('Annual Income (k$)', 'Annual Income Distribution', 'lightgreen'),
    ('Spending Score (1-100)', 'Spending Score Distribution', 'salmon'),
]
for panel, (column, title, colour) in zip(axes[0], histogram_specs):
    panel.hist(data[column], bins=20, alpha=0.7, edgecolor='black', color=colour)
    panel.set_title(title)
    panel.set_xlabel(column)
    panel.set_ylabel('Frequency')

# Bottom-left: share of each gender.
gender_counts = data['Gender'].value_counts()
gender_panel = axes[1, 0]
gender_panel.pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%', startangle=90)
gender_panel.set_title('Gender Distribution')

# Bottom-middle and bottom-right: pairwise scatter plots. The last item holds
# extra scatter() keywords (only the second plot overrides the colour).
scatter_specs = [
    (axes[1, 1], 'Annual Income (k$)', 'Spending Score (1-100)', 'Income vs Spending Score', {}),
    (axes[1, 2], 'Age', 'Annual Income (k$)', 'Age vs Income', {'c': 'orange'}),
]
for panel, x_column, y_column, title, extra_kwargs in scatter_specs:
    panel.scatter(data[x_column], data[y_column], alpha=0.6, s=50, **extra_kwargs)
    panel.set_xlabel(x_column)
    panel.set_ylabel(y_column)
    panel.set_title(title)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
# Write the figure to disk before showing it, then register it in the results store.
plt.savefig(f"{results_dir}/01_data_exploration.png", dpi=300, bbox_inches='tight')
plt.show()

results['visualizations'].append("01_data_exploration.png")

In [None]:
# Pairwise correlation of every numeric column, drawn as an annotated heatmap.
plt.figure(figsize=(10, 8))
numeric_cols = data.select_dtypes(include=[np.number]).columns
correlation_matrix = data.loc[:, numeric_cols].corr()

heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 0.5,
    'cbar_kws': {"shrink": .8},
}
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Numeric Features', fontsize=14, fontweight='bold')
plt.tight_layout()

# Persist both the picture and the underlying numbers.
plt.savefig(f"{results_dir}/02_correlation_matrix.png", dpi=300, bbox_inches='tight')
plt.show()
correlation_matrix.to_csv(f"{results_dir}/correlation_matrix.csv")

results['visualizations'].append("02_correlation_matrix.png")

## 🔧 Data Preprocessing for Clustering

In [None]:
# Clustering uses only these two columns; X is an independent copy of them.
features = ['Annual Income (k$)', 'Spending Score (1-100)']
X = data.loc[:, features].copy()

print(
    f"Features selected for clustering: {features}",
    f"Data shape: {X.shape}",
    "\nFeature statistics:",
    sep="\n",
)
print(X.describe())

In [None]:
# Standardise both features so neither dominates the distance computation.
# The fitted scaler is kept so cluster centres can be mapped back later.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

print("✅ Features normalized using StandardScaler")
print(f"Scaled data shape: {X_scaled.shape}")
# Sanity check: per-feature mean and standard deviation after scaling.
print(f"Mean of scaled features: {X_scaled.mean(axis=0)}")
print(f"Std of scaled features: {X_scaled.std(axis=0)}")

## 🔍 Finding Optimal Number of Clusters

We'll use two methods to determine the optimal number of clusters:
1. **Elbow Method**: Look for the "elbow" in the WCSS plot
2. **Silhouette Analysis**: Find the K with the highest silhouette score

In [None]:
# Fit K-Means for every K in 1..max_k, recording inertia (WCSS) and silhouette.
max_k = 10
wcss = []
silhouette_scores = []
k_range = range(1, max_k + 1)

print(f"Testing K values from 1 to {max_k}...")

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

    if k == 1:
        # Silhouette is undefined for one cluster; a 0 placeholder keeps
        # silhouette_scores aligned index-for-index with k_range.
        silhouette_scores.append(0)
        print(f"K={k}: WCSS={kmeans.inertia_:.2f}")
        continue

    score = silhouette_score(X_scaled, kmeans.labels_)
    silhouette_scores.append(score)
    print(f"K={k}: WCSS={kmeans.inertia_:.2f}, Silhouette Score={score:.3f}")

print("\n✅ Analysis completed!")

In [None]:
# Side-by-side diagnostics: WCSS elbow curve (left) and silhouette scores (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# The first silhouette entry is the K=1 placeholder, so only K>=2 is considered.
valid_scores = silhouette_scores[1:]
best_score = max(valid_scores)
best_k = valid_scores.index(best_score) + 2  # list position 0 corresponds to K=2

# --- Elbow method ---
ax1.plot(k_range, wcss, 'bo-', linewidth=2, markersize=8)
ax1.set_title('Elbow Method for Optimal K', fontsize=14, fontweight='bold')
ax1.set_xlabel('Number of Clusters (K)', fontsize=12)
ax1.set_ylabel('WCSS (Within-Cluster Sum of Squares)', fontsize=12)
ax1.grid(True, alpha=0.3)

# Label the candidate elbow points K=2..5 with their WCSS.
for k, w in zip(k_range, wcss):
    if 2 <= k <= 5:
        ax1.annotate(
            f'K={k}\nWCSS={w:.0f}',
            (k, w),
            textcoords="offset points",
            xytext=(0, 10),
            ha='center',
            fontsize=10,
        )

# --- Silhouette analysis ---
ax2.plot(range(2, max_k + 1), valid_scores, 'ro-', linewidth=2, markersize=8)
ax2.set_title('Silhouette Analysis', fontsize=14, fontweight='bold')
ax2.set_xlabel('Number of Clusters (K)', fontsize=12)
ax2.set_ylabel('Silhouette Score', fontsize=12)
ax2.grid(True, alpha=0.3)

# Call out the K with the highest silhouette score.
highlight_box = dict(boxstyle="round,pad=0.3", facecolor="yellow", alpha=0.7)
ax2.annotate(
    f'Best K={best_k}\nScore={best_score:.3f}',
    (best_k, best_score),
    textcoords="offset points",
    xytext=(0, 15),
    ha='center',
    fontsize=12,
    bbox=highlight_box,
)

plt.tight_layout()
plt.savefig(f"{results_dir}/03_elbow_silhouette_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

# Persist the raw numbers behind both curves (the K=1 row carries the 0 placeholder).
elbow_data = pd.DataFrame({
    'K': k_range,
    'WCSS': wcss,
    'Silhouette_Score': silhouette_scores
})
elbow_data.to_csv(f"{results_dir}/elbow_analysis_data.csv", index=False)
results['visualizations'].append("03_elbow_silhouette_analysis.png")

print(f"\n📊 Analysis Results:")
print(f"   🎯 Best K based on Silhouette Score: {best_k}")
print(f"   📈 Best Silhouette Score: {best_score:.3f}")

# Record the silhouette-optimal K in the shared results store.
performance = results['performance_metrics']
performance['optimal_k'] = best_k
performance['best_silhouette_score'] = best_score

## 🎯 Perform K-Means Clustering

In [None]:
# Fit the final model.
# NOTE(review): K is fixed at 4 here rather than taken from best_k computed by
# the silhouette sweep; the persona table further down is keyed 0-3, so the two
# must be changed together if K is ever made data-driven.
optimal_k = 4  # Based on silhouette analysis

print(f"Performing K-Means clustering with K={optimal_k}")

kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans.fit(X_scaled)
cluster_labels = kmeans.labels_

# Attach each customer's cluster id to the original table.
data['Cluster'] = cluster_labels

# Quality metrics for the chosen model.
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
inertia = kmeans.inertia_

print(f"\n✅ Clustering completed!")
print(f"   📊 Silhouette Score: {silhouette_avg:.3f}")
print(f"   📊 WCSS (Inertia): {inertia:.2f}")
print(f"   🎪 Number of clusters: {optimal_k}")

# Customers per cluster, ordered by cluster id.
cluster_counts = pd.Series(cluster_labels).value_counts().sort_index()
total_labelled = len(cluster_labels)
print(f"\n📈 Cluster Distribution:")
for cluster, count in cluster_counts.items():
    percentage = (count / total_labelled) * 100
    print(f"   Cluster {cluster}: {count} customers ({percentage:.1f}%)")

# Persist the labelled table and record the metrics in the results store.
data.to_csv(f"{results_dir}/clustered_customer_data.csv", index=False)

final_metrics = {
    'final_silhouette_score': silhouette_avg,
    'inertia': inertia,
    'n_clusters': optimal_k,
    'cluster_distribution': cluster_counts.to_dict()
}
results['performance_metrics'].update(final_metrics)

## 📋 Cluster Analysis and Interpretation

In [None]:
# Profile every cluster: centre coordinates plus size, age, income, spending
# and gender mix.
print("📋 DETAILED CLUSTER ANALYSIS")
print("="*50)

# Centres live in standardised space; map them back to k$ / score units.
cluster_centers_scaled = kmeans.cluster_centers_
cluster_centers_original = scaler.inverse_transform(cluster_centers_scaled)

centers_df = pd.DataFrame(cluster_centers_original, columns=features).assign(
    Cluster=range(len(cluster_centers_original))
)

print("\n🎯 Cluster Centers (Original Scale):")
print(centers_df.round(2))

centers_df.to_csv(f"{results_dir}/cluster_centers.csv", index=False)

# One summary record per cluster, in ascending cluster-id order.
cluster_analysis = []
total_customers = len(data)

for cluster_id in sorted(data['Cluster'].unique()):
    cluster_data = data[data['Cluster'] == cluster_id]

    member_count = len(cluster_data)
    member_share = member_count/total_customers*100
    female_share = (cluster_data['Gender'] == 'Female').sum() / member_count * 100
    mean_age = cluster_data['Age'].mean()
    mean_income = cluster_data['Annual Income (k$)'].mean()
    mean_spending = cluster_data['Spending Score (1-100)'].mean()

    cluster_analysis.append({
        'Cluster': cluster_id,
        'Count': member_count,
        'Percentage': member_share,
        'Avg_Age': mean_age,
        'Avg_Income': mean_income,
        'Avg_Spending': mean_spending,
        'Gender_Female_%': female_share
    })

    print(f"\n🏷️ CLUSTER {cluster_id} ANALYSIS:")
    print(f"   📊 Size: {member_count} customers ({member_share:.1f}%)")
    print(f"   👥 Gender: {female_share:.1f}% Female")
    print(f"   📈 Average Age: {mean_age:.1f} years")
    print(f"   💰 Average Income: ${mean_income:.1f}k")
    print(f"   🛍️ Average Spending Score: {mean_spending:.1f}")

# Tabulate, show, persist and register the per-cluster summary.
cluster_summary = pd.DataFrame(cluster_analysis)
print("\n📊 Cluster Summary Table:")
print(cluster_summary.round(2))

cluster_summary.to_csv(f"{results_dir}/cluster_analysis_summary.csv", index=False)
results['cluster_analysis'] = cluster_summary.to_dict('records')

## 🎨 Cluster Visualization

In [None]:
# 2x2 dashboard of the final clustering: two scatter views, cluster sizes, and
# normalised per-cluster averages.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Customer Cluster Analysis Visualizations', fontsize=16, fontweight='bold')

# One colour per cluster id. Only five are defined, so optimal_k must stay <= 5.
colors = ['red', 'blue', 'green', 'purple', 'orange']

ax1 = axes[0, 0]  # 1. Income vs Spending Score
ax2 = axes[0, 1]  # 2. Age vs Income
ax3 = axes[1, 0]  # 3. Cluster size distribution
ax4 = axes[1, 1]  # 4. Average metrics by cluster

# Panels 1 and 2: draw each cluster's members on both scatter views.
for i in range(optimal_k):
    cluster_data = data[data['Cluster'] == i]
    point_style = dict(c=colors[i], label=f'Cluster {i}', alpha=0.7, s=50)
    ax1.scatter(cluster_data['Annual Income (k$)'],
                cluster_data['Spending Score (1-100)'],
                **point_style)
    ax2.scatter(cluster_data['Age'], cluster_data['Annual Income (k$)'],
                **point_style)

# Centroids belong only on panel 1, whose axes match the clustering features.
ax1.scatter(cluster_centers_original[:, 0], cluster_centers_original[:, 1],
            c='black', marker='X', s=200, linewidths=3, label='Centroids')

panel_labels = [
    (ax1, 'Annual Income (k$)', 'Spending Score (1-100)', 'Income vs Spending Score'),
    (ax2, 'Age', 'Annual Income (k$)', 'Age vs Income by Cluster'),
]
for panel, x_label, y_label, title in panel_labels:
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
    panel.set_title(title)
    panel.legend()
    panel.grid(True, alpha=0.3)

# Panel 3: customers per cluster, with the count printed above each bar.
cluster_counts = data['Cluster'].value_counts().sort_index()
bars = ax3.bar(cluster_counts.index, cluster_counts.values,
               color=colors[:optimal_k], alpha=0.7)
ax3.set_xlabel('Cluster')
ax3.set_ylabel('Number of Customers')
ax3.set_title('Cluster Size Distribution')

for bar, count in zip(bars, cluster_counts.values):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 1
    ax3.text(label_x, label_y, str(count), ha='center', va='bottom', fontweight='bold')

# Panel 4: per-cluster means, each column rescaled so its largest mean is 100.
cluster_metrics = data.groupby('Cluster')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean()
cluster_metrics_norm = cluster_metrics.div(cluster_metrics.max()) * 100

x = np.arange(len(cluster_metrics.index))
width = 0.25

# Three bars per cluster, placed left / centre / right of the tick.
grouped_bars = [
    (x - width, 'Age', 'Age (normalized)'),
    (x, 'Annual Income (k$)', 'Income (normalized)'),
    (x + width, 'Spending Score (1-100)', 'Spending Score'),
]
for positions, column, label in grouped_bars:
    ax4.bar(positions, cluster_metrics_norm[column], width, label=label, alpha=0.7)

ax4.set_xlabel('Cluster')
ax4.set_ylabel('Normalized Values (0-100)')
ax4.set_title('Average Metrics by Cluster')
ax4.set_xticks(x)
ax4.set_xticklabels([f'Cluster {i}' for i in range(optimal_k)])
ax4.legend()

plt.tight_layout()
plt.savefig(f"{results_dir}/04_cluster_visualizations.png", dpi=300, bbox_inches='tight')
plt.show()

results['visualizations'].append("04_cluster_visualizations.png")

In [None]:
# 3D scatter of Age x Income x Spending Score, coloured by cluster.
# Age is plotted for context only; it was not a clustering feature.
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot(111, projection='3d')

axis_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for i in range(optimal_k):
    cluster_data = data[data['Cluster'] == i]
    xs, ys, zs = (cluster_data[column] for column in axis_columns)
    ax.scatter(xs, ys, zs, c=colors[i], label=f'Cluster {i}', alpha=0.7, s=50)

ax.set_xlabel(axis_columns[0])
ax.set_ylabel(axis_columns[1])
ax.set_zlabel(axis_columns[2])
ax.set_title('3D Customer Clusters\n(Age, Income, Spending Score)', fontsize=14, fontweight='bold')
ax.legend()

plt.tight_layout()
plt.savefig(f"{results_dir}/05_3d_cluster_visualization.png", dpi=300, bbox_inches='tight')
plt.show()

results['visualizations'].append("05_3d_cluster_visualization.png")

## 💡 Business Insights and Recommendations

Based on the clustering analysis, we can identify distinct customer segments and develop targeted strategies.

In [None]:
# Generate business insights: attach a persona and marketing strategies to
# every cluster, print them, and save them as a text report and a CSV.
print("💡 BUSINESS INSIGHTS & RECOMMENDATIONS")
print("="*60)

# Persona definitions keyed by cluster id.
# NOTE(review): the keys are raw K-Means label ids; nothing here checks that,
# for example, cluster 0 really is the high-income/high-spending group. Compare
# with the cluster centres before relying on these labels.
insights = {
    0: {
        'name': '🌟 Premium Customers',
        'description': 'High income, high spending customers',
        'strategies': [
            '🎯 Target with luxury and premium products',
            '💎 Implement exclusive loyalty programs',
            '🏆 Offer VIP customer service',
            '📧 Send personalized high-end product recommendations'
        ]
    },
    1: {
        'name': '💰 Budget-Conscious Customers',
        'description': 'Lower income, conservative spending',
        'strategies': [
            '🏷️ Focus on value deals and discounts',
            '📦 Promote essential and practical products',
            '💳 Offer payment plans and financing options',
            '📱 Use price-sensitive marketing channels'
        ]
    },
    2: {
        'name': '⚠️ Budget Enthusiasts',
        'description': 'Moderate income but high spending tendency',
        'strategies': [
            '🎪 Promote trendy and fashionable items',
            '📊 Encourage bulk purchases with discounts',
            '🔄 Implement cashback and reward programs',
            '⏰ Create urgency with limited-time offers'
        ]
    },
    3: {
        'name': '🎯 Conservative High-Income',
        'description': 'High income but conservative spending habits',
        'strategies': [
            '📚 Provide detailed product information and reviews',
            '✨ Emphasize quality and durability',
            '🔍 Offer comparison tools and guides',
            '🎓 Use educational marketing approaches'
        ]
    }
}

overall_recommendations = [
    ('🔄', 'Develop cluster-specific marketing campaigns'),
    ('📊', 'Monitor customer migration between clusters over time'),
    ('🎨', 'Customize website experience for each segment'),
    ('📈', 'Set cluster-specific KPIs and success metrics'),
    ('🤝', 'Create cross-cluster upselling opportunities'),
    ('📧', 'Implement segment-based email marketing'),
]

business_report = []
# The text report is collected as fragments and joined once at the end.
report_parts = ["BUSINESS INSIGHTS & RECOMMENDATIONS\n" + "="*60 + "\n\n"]
total_customers = len(data)

for cluster_id in sorted(data['Cluster'].unique()):
    cluster_data = data[data['Cluster'] == cluster_id]
    # Clusters without a persona entry fall back to a generic, empty profile.
    insight = insights.get(cluster_id, {'name': f'Cluster {cluster_id}', 'description': '', 'strategies': []})

    segment_size = len(cluster_data)
    segment_share = segment_size/total_customers*100
    segment_income = cluster_data['Annual Income (k$)'].mean()
    segment_spending = cluster_data['Spending Score (1-100)'].mean()

    business_report.append({
        'cluster_id': cluster_id,
        'name': insight['name'],
        'description': insight['description'],
        'size': segment_size,
        'percentage': segment_share,
        'avg_income': segment_income,
        'avg_spending': segment_spending,
        'strategies': insight['strategies']
    })

    print(f"\n{insight['name']} (Cluster {cluster_id}):")
    print(f"   📊 Size: {segment_size} customers ({segment_share:.1f}%)")
    print(f"   📝 Profile: {insight['description']}")
    print(f"   💰 Avg Income: ${segment_income:.0f}k")
    print(f"   🛍️ Avg Spending Score: {segment_spending:.0f}")
    print(f"   🎯 Marketing Strategies:")
    for strategy in insight['strategies']:
        print(f"      {strategy}")

    # Same content for the text report, without the decorative line prefixes.
    report_parts.append(f"\n{insight['name']} (Cluster {cluster_id}):\n")
    report_parts.append(f"   Size: {segment_size} customers ({segment_share:.1f}%)\n")
    report_parts.append(f"   Profile: {insight['description']}\n")
    report_parts.append(f"   Avg Income: ${segment_income:.0f}k\n")
    report_parts.append(f"   Avg Spending Score: {segment_spending:.0f}\n")
    report_parts.append("   Marketing Strategies:\n")
    report_parts.extend(f"      {strategy}\n" for strategy in insight['strategies'])

print(f"\n🎯 OVERALL STRATEGIC RECOMMENDATIONS:")
for icon, recommendation in overall_recommendations:
    print(f"   {icon} {recommendation}")

report_parts.append("\nOVERALL STRATEGIC RECOMMENDATIONS:\n")
report_parts.extend(f"   • {recommendation}\n" for _, recommendation in overall_recommendations)
insights_text = "".join(report_parts)

# The report contains emoji and bullet characters, so the encoding is pinned to
# UTF-8; the platform default (e.g. cp1252) cannot encode them and would raise
# UnicodeEncodeError.
with open(f"{results_dir}/business_insights.txt", "w", encoding="utf-8") as f:
    f.write(insights_text)

# Save business insights as CSV
business_df = pd.DataFrame(business_report)
business_df.to_csv(f"{results_dir}/business_insights.csv", index=False)

results['business_insights'] = business_report

## 📊 Cluster Performance Summary

In [None]:
# Final wrap-up: print the clustering summary, write 00_FINAL_SUMMARY.txt, and
# echo everything collected in the shared results store.
print("📊 FINAL CLUSTERING SUMMARY")
print("="*50)

print(f"✅ Successfully segmented {len(data)} customers into {optimal_k} distinct clusters")
print(f"📈 Clustering Quality Metrics:")
print(f"   🎯 Silhouette Score: {silhouette_avg:.3f} (Range: -1 to 1, higher is better)")
print(f"   📊 WCSS (Inertia): {inertia:.2f} (Lower is better)")

# Size and share of every cluster, computed once and reused for both the
# console output and the summary file.
cluster_breakdown = []
for cluster_id in sorted(data['Cluster'].unique()):
    count = len(data[data['Cluster'] == cluster_id])
    percentage = count / len(data) * 100
    cluster_breakdown.append((cluster_id, count, percentage))

print(f"\n🎪 Cluster Distribution:")
for cluster_id, count, percentage in cluster_breakdown:
    print(f"   Cluster {cluster_id}: {count:3d} customers ({percentage:5.1f}%)")

print(f"\n🔍 Key Insights:")
print(f"   • Income and spending behavior show clear segmentation patterns")
print(f"   • Four distinct customer personas identified")
print(f"   • Each cluster requires different marketing approaches")
print(f"   • Opportunity for targeted product recommendations")

print(f"\n🚀 Next Steps:")
print(f"   1. Implement cluster-based marketing campaigns")
print(f"   2. Develop cluster-specific product recommendations")
print(f"   3. Monitor cluster performance and evolution")
print(f"   4. A/B test different strategies for each segment")
print(f"   5. Collect additional data to refine segmentation")

# `datetime` is the class imported via `from datetime import datetime`, so
# now() is called on it directly; `datetime.datetime.now()` raised
# AttributeError.
analysis_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Save final summary
final_summary = f"""CUSTOMER SEGMENTATION ANALYSIS - FINAL SUMMARY
{'='*60}

EXECUTION DETAILS:
Analysis Date: {analysis_timestamp}
Results Directory: {results_dir}

DATASET OVERVIEW:
• Total Customers: {len(data)}
• Features Used: {features}
• Dataset Shape: {data.shape}

CLUSTERING RESULTS:
• Optimal K: {optimal_k}
• Silhouette Score: {silhouette_avg:.3f}
• WCSS (Inertia): {inertia:.2f}

CLUSTER DISTRIBUTION:
"""

for cluster_id, count, percentage in cluster_breakdown:
    final_summary += f"• Cluster {cluster_id}: {count:3d} customers ({percentage:5.1f}%)\n"

final_summary += f"""
GENERATED FILES:
• Dataset Info: dataset_info.txt
• Clustered Data: clustered_customer_data.csv
• Cluster Centers: cluster_centers.csv
• Cluster Analysis: cluster_analysis_summary.csv
• Business Insights: business_insights.txt & business_insights.csv
• Correlation Matrix: correlation_matrix.csv
• Elbow Analysis: elbow_analysis_data.csv
• Visualizations: {', '.join(results['visualizations'])}

KEY INSIGHTS:
• Income and spending behavior show clear segmentation patterns
• Four distinct customer personas identified
• Each cluster requires different marketing approaches
• Opportunity for targeted product recommendations

NEXT STEPS:
1. Implement cluster-based marketing campaigns
2. Develop cluster-specific product recommendations
3. Monitor cluster performance and evolution
4. A/B test different strategies for each segment
5. Collect additional data to refine segmentation

Project completed successfully! ✨
"""

# The summary contains bullets and emoji; pin UTF-8 so the write does not
# depend on the platform's default encoding.
summary_filename = "00_FINAL_SUMMARY.txt"
with open(f"{results_dir}/{summary_filename}", "w", encoding="utf-8") as f:
    f.write(final_summary)

# Count what is actually on disk instead of relying on hard-coded totals,
# which no longer matched the files the notebook writes.
saved_files = os.listdir(results_dir)
data_file_count = sum(
    1 for file_name in saved_files
    if not file_name.endswith('.png') and file_name != summary_filename
)

print(f"\n✨ Project completed successfully! ✨")
print(f"\n📁 ALL ANALYSIS RESULTS SAVED TO: {results_dir}")
print(f"📄 Files generated: {len(saved_files)} files")
print(f"🎨 Visualizations created: {len(results['visualizations'])} plots")

# Display comprehensive results summary
print("\n" + "="*80)
print("🎉 COMPREHENSIVE ANALYSIS SUMMARY - ALL RESULTS AT ONCE")
print("="*80)

dataset_info = results['dataset_info']
print(f"\n📊 DATASET SUMMARY:")
print(f"   • Shape: {dataset_info['shape']}")
print(f"   • Columns: {dataset_info['columns']}")
print(f"   • Missing Values: {sum(dataset_info['missing_values'].values())} total")

metrics = results['performance_metrics']
print(f"\n🔍 CLUSTERING PERFORMANCE:")
# 'optimal_k' is the silhouette-best K from the sweep; the model actually
# fitted used optimal_k (stored under 'n_clusters'), which may differ.
print(f"   • Optimal K: {metrics['optimal_k']}")
print(f"   • Best Silhouette Score: {metrics['best_silhouette_score']:.3f}")
print(f"   • Final Silhouette Score: {metrics['final_silhouette_score']:.3f}")
print(f"   • WCSS (Inertia): {metrics['inertia']:.2f}")

print(f"\n🎪 CLUSTER BREAKDOWN:")
for cluster_info in results['cluster_analysis']:
    print(f"   • Cluster {cluster_info['Cluster']}: {cluster_info['Count']} customers ({cluster_info['Percentage']:.1f}%)")
    print(f"     Income: ${cluster_info['Avg_Income']:.0f}k | Spending: {cluster_info['Avg_Spending']:.0f} | Age: {cluster_info['Avg_Age']:.0f}")

print(f"\n💡 BUSINESS SEGMENTS:")
for insight in results['business_insights']:
    print(f"   • {insight['name']}: {insight['description']}")
    print(f"     Size: {insight['size']} customers | Strategies: {len(insight['strategies'])} recommended")

print(f"\n📁 SAVED FILES:")
print(f"   • Main Directory: {results_dir}")
print(f"   • Data Files: {data_file_count} CSV/TXT files")
print(f"   • Visualizations: {len(results['visualizations'])} PNG files")
print(f"   • Summary Report: {summary_filename}")


def _segment_size(name_keyword):
    """Return the size of the first segment whose name contains the keyword, or 0 if none does."""
    return next(
        (segment['size'] for segment in results['business_insights']
         if name_keyword in segment['name']),
        0,
    )


print(f"\n🎯 QUICK ACCESS TO KEY INSIGHTS:")
print(f"   1. 🌟 Premium Customers: {_segment_size('Premium')} customers")
print(f"   2. 💰 Budget-Conscious: {_segment_size('Budget-Conscious')} customers")
print(f"   3. ⚠️ Budget Enthusiasts: {_segment_size('Enthusiasts')} customers")
print(f"   4. 🎯 Conservative High-Income: {_segment_size('Conservative')} customers")

print(f"\n✅ ANALYSIS COMPLETE! All results are saved and displayed above.")
print(f"📂 Open '{results_dir}' folder to access all generated files.")