In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Step 1: Load and explore the data**

What are the different variables in the dataset?
What do these variables tell us about the customer?
Are these variables useful for understanding customer behavior?

In [None]:
import pandas as pd

# Load the customer dataset from the Kaggle input directory
df = pd.read_csv("/kaggle/input/customer-segmentation-dataset-1/customer_dataset.csv")

# Preview the data: first rows, then column dtypes / non-null counts, then summary statistics
print(df.head())
# info() writes its report to stdout itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
df.info()
print(df.describe())

# **Step 2: Data Cleaning**

Check for missing values and, for continuous variables, replace them with the median.
Check for categorical variables and encode them as numerical values.

In [None]:
# Standardize column names first: trim whitespace, turn spaces into underscores
# and remove parentheses. Doing this before any column is referenced keeps the
# cell re-runnable (the renaming is a no-op on already-clean names, whereas
# looking up the pre-rename 'Annual Income' fails on a second run).
# regex=False treats '(' and ')' as literal characters regardless of the
# default regex setting of the installed pandas version.
df.columns = (
    df.columns.str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Handle missing values in annual income by filling them with the column median
df['Annual_Income'] = df['Annual_Income'].fillna(df['Annual_Income'].median())

# Encode categorical variables as dummy columns; drop_first drops one level per
# variable so the remaining dummies are not perfectly collinear.
df_encoded = pd.get_dummies(df, columns=['Lifestyle', 'Engagement_Level'], drop_first=True)

# **Step 3: Feature Scaling**

The data must be standardized to ensure fair clustering, especially for distance-based clustering algorithms like k-means. Otherwise, the clustering can be biased by variables with large magnitudes, such as Income.

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric columns used directly as clustering inputs
base_features = ['Age', 'Annual_Income', 'Spending_Score', 'Loyalty_Tier', 'Time_Spent_mins/week']

# Dummy columns produced by the one-hot encoding of Lifestyle and Engagement_Level
dummy_markers = ('Lifestyle_', 'Engagement_Level_')
dummy_features = [
    col for col in df_encoded.columns
    if any(marker in col for marker in dummy_markers)
]

# Full feature list for clustering
features = base_features + dummy_features

# Standardize every selected feature to zero mean and unit variance
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_encoded[features])

# **Step 4: Determine optimal number of clusters (Elbow Method)**

Inertia is the sum of squared distances between each data point and the centroid of its assigned cluster.
- Lower inertia means tighter clusters.
- But too many clusters will always reduce inertia—so we need a balance.

The Elbow method plots inertia against the number of clusters. The "elbow" — the point after which adding more clusters yields only small further reductions in inertia — is a good candidate for the number of clusters to extract.

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Candidate cluster counts for the elbow curve; defined once so the fitting
# loop and the x-axis of the plot cannot drift apart.
elbow_k_values = range(1, 11)

# Fit k-means for each candidate k and record the resulting inertia.
# n_init is set explicitly (matching the silhouette evaluation) because its
# default differs between scikit-learn releases, which would otherwise make
# the curve depend on the installed version.
inertia = []
for k in elbow_k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Plot inertia against the number of clusters to look for the elbow
plt.plot(elbow_k_values, inertia, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# **Step 5: Evaluate Cluster Quality (Silhouette Score)**

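The silhouette score assesses how well separated the clusters are. It ranges from -1 to 1: values near 1 indicate compact, well-separated clusters, values near 0 indicate overlapping clusters, and negative values suggest that points may have been assigned to the wrong cluster.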

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Evaluate each candidate k on scaled_data, the same standardized feature
# matrix used for the elbow curve and for the final model. Rebuilding the
# inputs from df's numeric columns here would leave out the one-hot encoded
# categorical features and would score a different clustering problem than
# the one actually solved. It would also rebind `features` from a list of
# column names to a DataFrame.
silhouette_scores = []
k_values = range(2, 10)  # silhouette needs at least 2 clusters; adjust the upper bound as needed

for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(scaled_data)
    score = silhouette_score(scaled_data, labels)
    silhouette_scores.append(score)
    print(f"k = {k}, Silhouette Score = {score:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Line chart of the silhouette score obtained for each candidate number of clusters
sil_fig, sil_ax = plt.subplots()
sil_ax.plot(k_values, silhouette_scores, marker='o')
sil_ax.set_title("Silhouette Score vs Number of Clusters")
sil_ax.set_xlabel("Number of Clusters (k)")
sil_ax.set_ylabel("Silhouette Score")
sil_ax.grid(True)
plt.show()

# **Step 6: Apply Clustering Algorithm**

The first step is to decide the clustering algorithm. 
- Common choices:
    - K-Means (centroid-based, efficient)
    - Hierarchical (dendrogram-based, interpretable)
    - DBSCAN (density-based, handles noise)
- The decision depends on data shape, noise, and desired interpretability.

The next step is to decide the number of clusters to form. 
The output is to assign each customer to a cluster.

In [None]:
# Choose optimal k (e.g., 4) from the elbow curve and the silhouette scores
optimal_k = 4

# Fit the final model on the scaled features and label every customer.
# n_init=10 matches the setting used when the silhouette scores were computed,
# so the final model is fitted the same way as the models that were evaluated.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(scaled_data)

# **Step 7: Visualize Clusters**

Interpret clusters visually and discuss business implications.

In [None]:
import seaborn as sns

# Scatter of Loyalty_Tier against Spending_Score, one color per cluster label
loyalty_ax = sns.scatterplot(
    x='Loyalty_Tier',
    y='Spending_Score',
    hue='Cluster',
    palette='Set2',
    data=df,
)
loyalty_ax.set_title('Customer Segments by Loyalty Tier and Spending Score')
plt.show()

In [None]:
# Scatter of Annual_Income against Spending_Score, one color per cluster label
income_ax = sns.scatterplot(
    x='Annual_Income',
    y='Spending_Score',
    hue='Cluster',
    palette='Set2',
    data=df,
)
income_ax.set_title('Customer Segments by Annual Income and Spending Score')
plt.show()

In [None]:
# Scatter of Age against Spending_Score, one color per cluster label
age_ax = sns.scatterplot(
    x='Age',
    y='Spending_Score',
    hue='Cluster',
    palette='Set2',
    data=df,
)
age_ax.set_title('Customer Segments by Age and Spending Score')
plt.show()

In [None]:
# Scatter of Lifestyle against Annual_Income, one color per cluster label
lifestyle_ax = sns.scatterplot(
    x='Lifestyle',
    y='Annual_Income',
    hue='Cluster',
    palette='Set2',
    data=df,
)
lifestyle_ax.set_title('Customer Segments by Lifestyle and Income')
plt.show()

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Columns mapped to the x, y and z axes (any three numeric variables work here)
x_var, y_var, z_var = 'Age', 'Annual_Income', 'Spending_Score'

# Restrict to the plotted columns plus the cluster label and discard incomplete rows
plot_df = df.loc[:, [x_var, y_var, z_var, 'Cluster']].dropna()

# Single 3D axes on a 10x8 inch figure
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One marker per customer, colored by cluster label
scatter = ax.scatter(
    plot_df[x_var], plot_df[y_var], plot_df[z_var],
    c=plot_df['Cluster'], cmap='viridis', s=50, alpha=0.8,
)

# Axis labels mirror the chosen column names
ax.set_xlabel(x_var)
ax.set_ylabel(y_var)
ax.set_zlabel(z_var)
ax.set_title("3D Cluster Plot")

# Color legend mapping marker colors to cluster labels
fig.colorbar(scatter, ax=ax, label='Cluster')
plt.show()

# **Step 8: Cluster Profiling**

Summarizes each cluster’s average characteristics.

In [None]:
# Group by cluster to understand characteristics: mean of every numeric
# feature within each cluster.
# 'Cluster' is itself a numeric column, so it is removed from the selection;
# otherwise its own mean appears as a redundant column next to the 'Cluster'
# index. errors='ignore' keeps this safe if the label column is not numeric.
numeric_cols = df.select_dtypes(include='number').columns.drop('Cluster', errors='ignore')
cluster_profile = df.groupby('Cluster')[numeric_cols].mean()
print(cluster_profile)