# **Customer Segmentation using Purchasing Behavior and Clustering Algorithms**

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from google.colab import files
import ipywidgets as widgets
from IPython.display import display, clear_output

In [None]:
# Load the customer dataset from a file uploaded through the Colab upload widget.
import io

print("Please upload 'customer_segmentation_dataset.csv'")
uploaded = files.upload()  # dict keyed by uploaded file name, values are the raw bytes
if uploaded:
    # Parse the bytes that were just uploaded rather than re-opening a fixed
    # path: the saved name can differ from the expected one (a differently
    # named file, or a de-duplicated re-upload), in which case the hard-coded
    # path is either missing or points at a stale copy.
    uploaded_name = 'customer_segmentation_dataset.csv'
    if uploaded_name not in uploaded:
        uploaded_name = next(iter(uploaded))
    data = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded (e.g. the dialog was cancelled): fall back to a
    # copy that may already be present in the working directory.
    data = pd.read_csv('customer_segmentation_dataset.csv')
data.head(30)

In [None]:
# Assemble the clustering inputs: select the three behavioural columns,
# replace missing values with 0, then standardise them with StandardScaler.
features = ['Frequency', 'MonetaryValue', 'Recency']
X = data.loc[:, features].fillna(value=0)

# Fit first, then transform, so the fitted `scaler` is available for reuse
# on new observations later in the notebook.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

preview = X_scaled[:10]
print("Scaled Features (first 10 rows):\n", preview)

In [None]:
# Sweep candidate cluster counts, recording the K-means inertia and the
# silhouette score for each k, then draw the elbow (inertia) curve.
inertia = []
silhouette_scores = []
K = range(2, 10)  # the sweep starts at k=2, so every k has a silhouette score
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)
    # Every k in K is >= 2, so no placeholder branch is needed; both lists
    # stay index-aligned with K (entry i belongs to K[i]).
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

plt.figure(figsize=(10, 6))
plt.plot(K, inertia, marker='o', label='Inertia')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.grid(True)
plt.show()

In [None]:
# Plot the silhouette score for every candidate number of clusters.
# `silhouette_scores` holds one real score per value in `K` (which starts at
# k=2), so the full series is plotted; slicing off the first element would
# hide the k=2 result.
plt.figure(figsize=(10, 6))
plt.plot(K, silhouette_scores, marker='o', label='Silhouette Score')
plt.title('Silhouette Score for Different Numbers of Clusters')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.grid(True)
plt.show()

In [None]:
# Fit the final K-means model, attach cluster labels and PCA coordinates to
# `data`, and show the segments in an interactive 3D scatter plot.
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
data['Cluster'] = kmeans.fit_predict(X_scaled)

# Project the scaled features onto three principal components for plotting.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)
data['PCA1'] = X_pca[:, 0]
data['PCA2'] = X_pca[:, 1]
data['PCA3'] = X_pca[:, 2]

# Cluster ids are categories, not magnitudes. Plotly Express maps a numeric
# colour column to a continuous colour scale, so plot from a copy whose
# labels are strings to get one discrete colour and legend entry per cluster.
# `data['Cluster']` itself stays integer for the rest of the notebook.
plot_data = data.assign(Cluster=data['Cluster'].astype(str))
fig = px.scatter_3d(plot_data, x='PCA1', y='PCA2', z='PCA3', color='Cluster',
                    hover_data=['CustomerID', 'Frequency', 'MonetaryValue', 'Recency'],
                    # Keep the legend in numeric cluster order.
                    category_orders={'Cluster': [str(c) for c in range(optimal_k)]},
                    title='3D Customer Segments (PCA)')
fig.update_layout(width=800, height=600)
fig.show()

In [None]:
# Summarise each cluster (mean of the three features, member count, most
# common product category), then plot Frequency against MonetaryValue with
# Recency mapped to marker size.
def _most_common(values):
    """Return the most frequent non-null value, or NaN when there is none."""
    modes = values.mode()
    # mode() is empty when every value in the group is missing; indexing it
    # unconditionally would raise instead of producing a summary row.
    return modes.iloc[0] if not modes.empty else np.nan


cluster_stats = data.groupby('Cluster').agg({
    'Frequency': 'mean',
    'MonetaryValue': 'mean',
    'Recency': 'mean',
    'CustomerID': 'count',
    'ProductCategory': _most_common
}).rename(columns={'CustomerID': 'Count'}).round(2)

plt.figure(figsize=(12, 8))
sns.scatterplot(data=data, x='Frequency', y='MonetaryValue', hue='Cluster',
                size='Recency', sizes=(50, 500), palette='deep')
plt.title('Customer Segments: Frequency vs Monetary Value (Recency as Size)')
plt.xlabel('Frequency (Number of Purchases)')
plt.ylabel('Monetary Value ($)')
plt.legend(title='Cluster')
plt.show()

In [None]:
# Grouped bar chart: row counts per product category within each cluster.
ax = plt.figure(figsize=(12, 8)).gca()
sns.countplot(data=data, x='Cluster', hue='ProductCategory', palette='muted', ax=ax)
ax.set_title('Product Category Distribution by Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Count')
ax.legend(title='Product Category')
plt.show()

In [None]:
# Report the per-cluster summary table, followed by the silhouette score of
# the final model's labels on the scaled features.
print("\nCluster Statistics:")
print(cluster_stats)
final_silhouette = silhouette_score(X_scaled, kmeans.labels_)
print(f"\nSilhouette Score for k={optimal_k}: {final_silhouette:.3f}")


In [None]:
# Count Sports-category customers with more than 15 purchases and more than
# $500 of spend, building the filter from three named boolean masks.
is_sports = data['ProductCategory'] == 'Sports'
is_frequent = data['Frequency'] > 15
is_big_spender = data['MonetaryValue'] > 500
high_value_sports = len(data[is_sports & is_frequent & is_big_spender])
print(f"\nInteresting Fact: {high_value_sports} customers in the Sports category are high-frequency buyers (more than 15 purchases) with high spending (over $500).")


In [None]:
def predict_cluster(frequency, monetary_value, recency):
    """Print the cluster the fitted K-means model assigns to a new customer.

    The inputs are scaled with the notebook-level ``scaler`` and classified
    with the notebook-level ``kmeans``; the result is printed, not returned.

    Args:
        frequency: Number of purchases.
        monetary_value: Monetary value in dollars.
        recency: Recency in days.
    """
    # The scaler was fitted on a DataFrame, so pass a DataFrame carrying the
    # same column names; a bare array makes scikit-learn warn that the input
    # lacks the feature names seen during fitting.
    new_data = pd.DataFrame([[frequency, monetary_value, recency]], columns=features)
    new_data_scaled = scaler.transform(new_data)
    cluster = kmeans.predict(new_data_scaled)[0]
    print(f"\nPredicted Cluster for new customer (Frequency: {frequency}, MonetaryValue: {monetary_value}, Recency: {recency}): Cluster {cluster}")

In [None]:
# Interactive form: three numeric inputs and a button; clicking the button
# prints the predicted cluster for the entered values into `output`.
output = widgets.Output()
predict_button = widgets.Button(description="Predict Cluster")
freq_input = widgets.FloatText(value=10, description="Frequency:")
monetary_input = widgets.FloatText(value=200, description="Monetary Value ($):")
recency_input = widgets.FloatText(value=100, description="Recency (days):")


def on_button_clicked(b):
    """Replace the output area's contents with a fresh prediction."""
    current_values = [field.value for field in (freq_input, monetary_input, recency_input)]
    with output:
        # Drop the previous prediction so only the latest one is shown.
        clear_output()
        predict_cluster(*current_values)


predict_button.on_click(on_button_clicked)

print("\nEnter new customer data to predict their cluster:")
form_widgets = (freq_input, monetary_input, recency_input, predict_button, output)
display(*form_widgets)

In [None]:
# Save the labelled dataset (including the Cluster and PCA columns), offer it
# for download, and print a short summary of the run.
data.to_csv('clustered_customer_data_enhanced.csv', index=False)
print("\nClustered data saved to 'clustered_customer_data_enhanced.csv'")
files.download('clustered_customer_data_enhanced.csv')

# Report the measured silhouette score rather than asserting "good"
# separation unconditionally: the quality depends on the uploaded data.
final_silhouette = silhouette_score(X_scaled, kmeans.labels_)

print("\nSummary:")
print(f"K-means clustering with {optimal_k} clusters was performed on Frequency, Monetary Value, and Recency.")
print("Key enhancements:")
print("- Hyperparameter tuning with elbow method and silhouette score.")
print("- 3D visualization using PCA for better cluster separation.")
print("- Interactive interface for predicting cluster membership of new customers.")
print("- High-frequency, high-value Sports customers identified as a key segment.")
print(f"The silhouette score for k={optimal_k} is {final_silhouette:.3f} (range -1 to 1; higher means better-separated clusters), and the category distribution plot highlights product preferences.")