In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

In [None]:
# Load the customer segmentation dataset from CSV into a DataFrame.
# NOTE(review): the absolute "/content/..." path presumably targets a hosted
# notebook runtime — confirm before running elsewhere.
csv_path = "/content/customer_segmentation_data.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Print the column dtypes and non-null counts.
data.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Column names available in the dataset.
data.columns

In [None]:
# Distinct values present in the 'gender' column.
data['gender'].unique()

In [None]:
# Distinct values present in the 'preferred_category' column.
data['preferred_category'].unique()

In [None]:
# Report how many rows fall into each gender, then into each preferred category.
gender_tally = data['gender'].value_counts()
print("Total categories in gender:", gender_tally, "\n")

category_tally = data['preferred_category'].value_counts()
print("Total categories in Prefered category:", category_tally)


In [None]:
# Remove the 'id' column before the analysis.
# errors='ignore' makes the cell safe to re-run: once 'id' has been dropped,
# a second execution is a no-op instead of raising KeyError.
data = data.drop(columns=['id'], errors='ignore')

Distribution plots for age, income, and spending_score

In [None]:
# Histograms (with a KDE overlay) of age, income and spending score, side by side.
# plt.subplots is the only figure-creating call here: an extra bare plt.figure()
# before it would leave an empty figure in the cell output.
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# (column, number of bins, panel title, x-axis label), left to right
panels = [
    ('age', 30, 'Age Distribution', 'Age'),
    ('income', 20, 'Income Distribution', 'Income'),
    ('spending_score', 20, 'Spending score Distribution', 'Spending score'),
]

for ax, (column, bins, title, xlabel) in zip(axes, panels):
    sns.histplot(data[column], kde=True, color='b', bins=bins, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('count')

plt.show()

In [None]:
# Horizontal bar chart of customers per gender, annotated with each gender's
# share of all rows.
gender_counts = data['gender'].value_counts()

plt.figure(1, figsize=(15, 5))
sns.countplot(data=data, y='gender', hue='gender', palette='Set2')

for label in data['gender'].unique():
    n_customers = gender_counts[label]
    share = (n_customers / data.shape[0]) * 100
    # the text starts 12 count-units to the left of the bar's end
    plt.text(n_customers - 12, label, f'{share: .1f}%', va='top', color='white')

plt.grid(False)
plt.show()

In [None]:
# Violin plots of age, income and spending score, split by gender.
# The seaborn theme is applied BEFORE the figure and axes are created: when it
# is set inside the loop, the first subplot already exists and keeps the
# previous style, so the panels look inconsistent on a fresh kernel.
sns.set(style='whitegrid')

plt.figure(1, figsize=(15, 6))
for n, cols in enumerate(['age', 'income', 'spending_score'], start=1):
    plt.subplot(1, 3, n)
    sns.violinplot(x=cols, y='gender', data=data, hue='gender')
    # only the left-most panel carries the y-axis label
    plt.ylabel('Gender' if n == 1 else '')
    plt.title('Violin Plot')
# spacing does not depend on the loop variable, so it is applied once
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

In [None]:
# Split the customer ages into five brackets; both bounds of each range are inclusive.
ages = data.age
age_18_25 = ages[ages.between(18, 25)]
age_26_35 = ages[ages.between(26, 35)]
age_36_45 = ages[ages.between(36, 45)]
age_46_55 = ages[ages.between(46, 55)]
age_gt_55 = ages[ages.ge(56)]

In [None]:
# Bracket labels (x) and the number of customers in each bracket (y) for the bar chart.
agex = ['18-25', '26-35', '36-45', '46-55', '55+']
age_groups = (age_18_25, age_26_35, age_36_45, age_46_55, age_gt_55)
agey = [len(group) for group in age_groups]

In [None]:
# Bar chart of the number of customers in each age bracket.
plt.figure(figsize=(10, 6))
# seaborn 0.13 deprecates `palette` without `hue`. Assigning the x variable to
# `hue` and disabling the legend keeps the same per-bar colouring without the
# FutureWarning, and matches the hue= usage of the other plots in this notebook.
sns.barplot(x=agex, y=agey, hue=agex, palette='mako', legend=False)
plt.title('Number of Customer and Ages')
plt.xlabel('Age')
plt.ylabel('Number of Customer')
plt.show()

In [None]:
# Scatter plot of income against spending score, one point per row.
sns.relplot(data=data, x="income", y="spending_score")

In [None]:
# Preview the frame before the categorical columns are encoded or dropped.
data.head()

In [None]:
# One-hot encode 'gender' into integer indicator columns.
# Guarded so that re-running the cell after 'gender' has already been replaced
# by its dummy columns is a no-op instead of a KeyError.
if 'gender' in data.columns:
    data = pd.get_dummies(data, columns=['gender'], dtype=int)

In [None]:
# Preview the frame after one-hot encoding 'gender'.
data.head()

In [None]:
# Drop 'preferred_category' so it is not part of the features used below.
# errors='ignore' keeps the cell re-runnable once the column is already gone
# (a second plain drop would raise KeyError).
data = data.drop(columns=['preferred_category'], errors='ignore')

In [None]:
# Preview the frame after dropping 'preferred_category'.
data.head()

In [None]:
# Scale the features with StandardScaler before clustering.
from sklearn.preprocessing import StandardScaler

# A later cell writes the predicted labels back into `data` as a 'cluster'
# column. If this cell is re-run afterwards, that column must not be fed back
# in as a feature, so it is excluded here (no-op when it does not exist yet).
features = data.drop(columns=['cluster'], errors='ignore')

scaler = StandardScaler()
data_scaled = scaler.fit_transform(features)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow method: fit KMeans for 1 to 10 clusters and record each fit's inertia
# (within-cluster sum of squares).
kmeans_params = dict(init='k-means++', max_iter=300, n_init=10, random_state=42)

wcss = []
for n_clusters in range(1, 11):
    kmeans = KMeans(n_clusters=n_clusters, **kmeans_params).fit(data_scaled)
    wcss.append(kmeans.inertia_)


In [None]:
# Plot WCSS against the number of clusters to look for the "elbow".
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 11), wcss, marker='8', color='blue')
ax.set_title('The Elbow Curve')
ax.set_xlabel('Num of clusters')
ax.set_ylabel('WCSS')
plt.show()

In [None]:
# Fit the final KMeans model and store each row's cluster label in `data`.
# NOTE(review): 4 is presumably the elbow read off the WCSS curve — confirm.
optimal_clusters = 4
kmeans = KMeans(
    n_clusters=optimal_clusters,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=42,
)
cluster_labels = kmeans.fit_predict(data_scaled)
data['cluster'] = cluster_labels

print(data.head())

In [None]:
# Plot the clusters on the income / spending_score plane, with their centroids.

plt.figure(figsize=(10, 6))

# One scatter layer per cluster label 0..3; legend names are 1-based.
cluster_colors = ['red', 'blue', 'green', 'cyan']
for cluster_id, color in enumerate(cluster_colors):
    members = data[data['cluster'] == cluster_id]
    plt.scatter(members['income'], members['spending_score'],
                s=100, c=color, label=f'Cluster {cluster_id + 1}')

# KMeans was fitted on the standardized matrix, so cluster_centers_ are in
# scaled units. Map them back to the original units before overlaying them on
# the raw income / spending_score axes; plotted as-is they would sit near the
# origin instead of inside their clusters.
centers = scaler.inverse_transform(kmeans.cluster_centers_)
# Look the two columns up by name rather than relying on fixed positions 1 and 2.
feature_names = list(scaler.feature_names_in_)
income_idx = feature_names.index('income')
spending_idx = feature_names.index('spending_score')
plt.scatter(centers[:, income_idx], centers[:, spending_idx],
            s=300, c='yellow', label='Centroids')

plt.title('Clusters based on Income and Spending Score')
plt.xlabel('Income')
plt.ylabel('Spending Score')
plt.legend()
plt.show()

In [None]:
# Mean of every column within each cluster.
by_cluster = data.groupby('cluster')
cluster_summary = by_cluster.mean()
print(cluster_summary)