In [None]:
# You own the mall and want to understand which customers can be converted most easily [Target Customers], so that this insight can be passed to the marketing team to plan their strategy accordingly.


In [None]:
import os

# Limit OpenMP to a single thread to avoid the KMeans memory-leak warning.
# This has to happen BEFORE numpy / scikit-learn are imported: the thread-count
# variable is read when the underlying OpenMP/MKL runtime is loaded, so assigning
# it after those imports has no effect in a fresh kernel.
os.environ["OMP_NUM_THREADS"] = "1"

import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Suppress specific warning (the message is matched against the start of the warning text)
warnings.filterwarnings("ignore", message="KMeans is known to have a memory leak on Windows with MKL")

# Load the data
# The CSV location can be overridden with the MALL_CUSTOMERS_CSV environment
# variable; when it is not set, the original hard-coded location is used.
data_path = os.environ.get('MALL_CUSTOMERS_CSV', 'D:\\Final_Projects\\Mall_Customers.csv')
df = pd.read_csv(data_path)

# Inspect the data
print(df.head())
# info() writes its report itself and returns None, so it must not be wrapped
# in print() (that emitted a stray "None" line after the report).
df.info()
print(df.describe())

# Report the number of missing values per column (nothing is imputed or dropped here)
print(df.isnull().sum())

# Encode 'Genre' as indicator columns; the first level is dropped so only the
# non-redundant indicator(s) remain
df_encoded = pd.get_dummies(df, columns=['Genre'], drop_first=True)

# Clustering inputs: annual income and spending score only
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
features = df[feature_columns]

# Standardise the selected features (fit the scaler, then apply it to the same data)
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

# Advanced EDA: one 2x2 figure with histograms (KDE overlay) on the top row
# and boxplots of the same two columns on the bottom row
plt.figure(figsize=(14, 7))

# (column in df, label used in the panel titles)
eda_panels = [
    ('Annual Income (k$)', 'Annual Income'),
    ('Spending Score (1-100)', 'Spending Score'),
]

# Histograms occupy subplot slots 1 and 2
for slot, (column, label) in enumerate(eda_panels, start=1):
    plt.subplot(2, 2, slot)
    sns.histplot(df[column], kde=True)
    plt.title(f'{label} Distribution')

# Boxplots occupy subplot slots 3 and 4
for slot, (column, label) in enumerate(eda_panels, start=3):
    plt.subplot(2, 2, slot)
    sns.boxplot(data=df[column])
    plt.title(f'{label} Boxplot')

plt.tight_layout()
plt.show()

# Pairwise scatter/distribution grid for the numeric columns plus the gender indicator
pair_columns = ['Annual Income (k$)', 'Spending Score (1-100)', 'Age', 'Genre_Male']
sns.pairplot(df_encoded[pair_columns])
plt.show()

# Correlation heatmap over every column of the encoded frame
# NOTE(review): corr() runs on all columns of df_encoded, so an identifier column
# (if the CSV contains one) would be correlated as well — confirm that is intended.
correlations = df_encoded.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(correlations, annot=True, cmap='coolwarm')
plt.title('Feature Correlation')
plt.show()

# Elbow method: fit KMeans for 1..10 clusters on the scaled features and record
# the within-cluster sum of squares (inertia) of each fit
cluster_counts = range(1, 11)
wcss = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)
    wcss.append(kmeans.fit(scaled_features).inertia_)

# WCSS against the number of clusters; the "elbow" of this curve suggests a cluster count
plt.figure(figsize=(10, 5))
plt.plot(cluster_counts, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Fit the 3-cluster KMeans model on the scaled features (fit() returns the estimator)
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=42).fit(scaled_features)

# Cluster label for every customer, and the fitted centroids
clusters = kmeans.predict(scaled_features)
centers = kmeans.cluster_centers_

# Scatter of the customers coloured by cluster, with centroids drawn as red X markers
income_scaled = scaled_features[:, 0]
spending_scaled = scaled_features[:, 1]
plt.figure(figsize=(10, 5))
plt.scatter(income_scaled, spending_scaled, c=clusters, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75, marker='X')
plt.title('Customer Segments')
plt.xlabel('Annual Income (scaled)')
plt.ylabel('Spending Score (scaled)')
plt.show()

# Step 1: Apply PCA
# scaled_features is built from two columns, so two components keep every
# dimension: the data is re-expressed on its principal axes rather than reduced.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(scaled_features)

# Step 2: Train KMeans on the PCA-transformed data
# This refits the existing `kmeans` estimator, replacing the model that was
# fitted on scaled_features.
clusters = kmeans.fit(pca_features).predict(pca_features)
centers = kmeans.cluster_centers_

# Step 4: Plot the Clusters (centroids drawn as red X markers)
component_1 = pca_features[:, 0]
component_2 = pca_features[:, 1]
plt.figure(figsize=(10, 5))
plt.scatter(component_1, component_2, c=clusters, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75, marker='X')
plt.title('Customer Segments (PCA Reduced)')
plt.xlabel('PCA Feature 1')
plt.ylabel('PCA Feature 2')
plt.show()

# Step 5: Analyze Cluster Centers (coordinates are in PCA space)
cluster_centers = pd.DataFrame(centers, columns=['PCA Feature 1', 'PCA Feature 2'])
print("Cluster Centers:\n", cluster_centers)

# Step 6: Add Cluster Labels to Original Data
# The PCA coordinates are also wrapped in a labelled frame; assign() returns a
# new frame, so df itself is left untouched.
pca_components = pd.DataFrame(pca_features, columns=['PCA Feature 1', 'PCA Feature 2'])
original_data_with_clusters = df.assign(Cluster=clusters)

# Preview of the labelled dataset
print("Original Data with Cluster Labels:\n", original_data_with_clusters.head())

# Step 7: Evaluate the Clustering (silhouette score computed in PCA space)
sil_score = silhouette_score(pca_features, clusters)
print(f'Silhouette Score: {sil_score}')

# Step 8: Further Visualizations (Optional) — pairwise plots coloured by cluster
sns.pairplot(original_data_with_clusters, hue='Cluster', palette='viridis')
plt.show()

# Step 9: Save the Results (written to the current working directory, without the index)
original_data_with_clusters.to_csv('customer_segments.csv', index=False)

# Additional Analyses

# Demographic Analysis by Clusters

# Age spread within each cluster
plt.figure(figsize=(10, 5))
sns.boxplot(data=original_data_with_clusters, x='Cluster', y='Age')
plt.title('Age Distribution by Cluster')
plt.show()

# Customer counts per cluster, split by the 'Genre' column
plt.figure(figsize=(10, 5))
sns.countplot(data=original_data_with_clusters, x='Cluster', hue='Genre')
plt.title('Gender Distribution by Cluster')
plt.show()

# Cluster Profiling
# Only numeric columns can be averaged, so non-numeric ones are filtered out
# before taking the per-cluster mean of each remaining column
numeric_cols = original_data_with_clusters.select_dtypes(include='number')
cluster_profiles = numeric_cols.groupby('Cluster').mean()
print("Cluster Profiles:\n", cluster_profiles)

# Marketing Strategies
# One recommendation per cluster label; the position in the list is the cluster id.
# NOTE(review): KMeans label indices carry no inherent meaning — confirm against the
# printed cluster profiles that each strategy is attached to the intended cluster.
strategy_notes = [
    "Strategy for Cluster 0: Offer loyalty rewards and exclusive deals to retain these high-spending customers.",
    "Strategy for Cluster 1: Introduce premium products or services that cater to their affluent lifestyle, possibly encouraging higher spending.",
    "Strategy for Cluster 2: Implement budget-friendly promotions and discounts to increase engagement and spending.",
]
cluster_strategies = dict(enumerate(strategy_notes))
print("Marketing Strategies:\n", cluster_strategies)

# Example: interactive scatter plot with Plotly Express
import plotly.express as px

# Cluster ids are category labels, not magnitudes. Plotly Express maps a numeric
# colour column to a continuous colour scale, so the labels are cast to strings
# on a plotting copy to get one discrete colour and legend entry per cluster.
cluster_plot_data = original_data_with_clusters.assign(
    Cluster=original_data_with_clusters['Cluster'].astype(str)
)

fig = px.scatter(
    cluster_plot_data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    color='Cluster',
    hover_data=['Age', 'Genre'],
    # Keep the legend in label order rather than order of first appearance
    category_orders={'Cluster': sorted(cluster_plot_data['Cluster'].unique())},
)
fig.show()
