In [None]:
### *Task 3: Customer Segmentation (Verified)*
#%%

pip install pandas numpy matplotlib seaborn scikit-learn plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity


# Build the clustering feature matrix: every column of profile_encoded except
# the identifier/date columns, then standardize it with StandardScaler.
# NOTE(review): assumes the remaining columns are all numeric - confirm upstream.
X = profile_encoded.drop(columns=['CustomerID', 'CustomerName', 'SignupDate'])
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Find optimal clusters using DB Index
# One KMeans model is fitted per candidate cluster count and scored with the
# Davies-Bouldin index.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 (and emits a FutureWarning in 1.2/1.3 when it is omitted), so
# leaving it unset makes the selected clustering depend on the installed version.
kmeans_params = {'n_init': 10, 'random_state': 42}
cluster_range = range(2, 11)
fitted_models = {}
db_scores = []

for n in cluster_range:
    model = KMeans(n_clusters=n, **kmeans_params)
    labels = model.fit_predict(X_scaled)
    fitted_models[n] = model
    db_scores.append(davies_bouldin_score(X_scaled, labels))

# Plot DB Index
plt.figure(figsize=(10,6))
plt.plot(cluster_range, db_scores, marker='o')
plt.title('Davies-Bouldin Index vs Number of Clusters', fontsize=14)
plt.xlabel('Number of Clusters', fontsize=12)
plt.ylabel('DB Index', fontsize=12)
plt.xticks(cluster_range)
plt.savefig('db_index.png', bbox_inches='tight')
plt.show()

# Final clustering with optimal clusters
# The candidate with the smallest DB index is selected. Its already-fitted
# model is reused instead of refitting, so `kmeans` and `clusters` are exactly
# the model and labels that produced the winning score.
optimal_clusters = cluster_range[int(np.argmin(db_scores))]
kmeans = fitted_models[optimal_clusters]
clusters = kmeans.labels_

# Visualize clusters with PCA
# Project the scaled features onto their first two principal components and
# colour each point by its cluster label.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(10,6))
sns.scatterplot(
    x=X_pca[:,0],
    y=X_pca[:,1],
    hue=clusters,
    palette='viridis',
    # Cluster labels are integers, so seaborn treats the hue as numeric; with
    # the default 'auto' legend a larger number of clusters is summarised by
    # evenly spaced sample values. 'full' gives every cluster its own entry.
    legend='full',
    s=100,
    alpha=0.8
)
plt.title(f'Customer Segmentation (PCA Visualization) - {optimal_clusters} Clusters', fontsize=14)
plt.xlabel('Principal Component 1', fontsize=12)
plt.ylabel('Principal Component 2', fontsize=12)
plt.savefig('cluster_visualization.png', bbox_inches='tight')
plt.show()

# Cluster analysis
# Attach the cluster label to a copy of customer_profile and report the
# per-cluster mean of each spending/activity metric.
cluster_profile = customer_profile.copy()
# NOTE(review): assumes customer_profile has one row per clustered sample, in
# the same order as the rows that were clustered - confirm upstream.
cluster_profile['Cluster'] = clusters
cluster_analysis = cluster_profile.groupby('Cluster').agg({
    'total_spent': 'mean',
    'avg_transaction': 'mean',
    'transaction_count': 'mean',
    'unique_products': 'mean'
}).reset_index()

print("Cluster Analysis:")
try:
    print(cluster_analysis.to_markdown(index=False))
except ImportError:
    # to_markdown needs the optional `tabulate` package; fall back to a plain
    # text table rather than failing after all the clustering work is done.
    print(cluster_analysis.to_string(index=False))