In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
# Read the mall-customer CSV into a DataFrame and preview the first rows.
csv_path = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
data = pd.read_csv(csv_path)
data.head()

# Plotting Annual Income and Spending Score

In [None]:
# Scatter of the two features that are clustered further down, before any
# cluster labels exist.
plt.scatter(data['Annual Income (k$)'], data['Spending Score (1-100)'])
plt.title('Annual Income vs Spending Score')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
# Render explicitly (as the elbow-method cell does) so the cell does not echo
# the Text(...) repr returned by the last labelling call.
plt.show()

# Elbow method to determine number of clusters in K-means

In [None]:
# Feature matrix for clustering. The columns are selected by name, matching
# the plotting cells, instead of by the positional indices [3, 4], so the
# selection does not silently change if the CSV column order changes.
segment = data[['Annual Income (k$)', 'Spending Score (1-100)']].values

# Fit K-means for k = 1..10 and record the within-cluster sum of squares
# (exposed by scikit-learn as `inertia_`) for each k.
k_values = range(1, 11)
wcss = []
for i in k_values:
    # A fixed random_state makes the centroid initialisation, and therefore
    # the elbow curve, identical on every Restart & Run All.
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=500, n_init=15,
                    random_state=42)
    kmeans.fit(segment)
    wcss.append(kmeans.inertia_)

# The "elbow" is the k after which adding clusters stops reducing WCSS much.
plt.plot(k_values, wcss, marker='o')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.grid()
plt.show()

# K-means clustering

The number of clusters will be 5: beyond 5 clusters the WCSS curve is almost linear, so adding more clusters yields little further improvement.

In [None]:
# Final model with k = 5. The remaining hyperparameters mirror the ones used
# in the elbow search so the chosen k is evaluated under the same settings,
# and random_state pins the otherwise arbitrary cluster numbering between runs.
kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=500, n_init=15,
                random_state=42)

# fit_predict() fits the model and returns each row's cluster label in one
# step; calling fit() first would only fit the same model a second time.
data['cluster'] = kmeans.fit_predict(segment)

# Preview only the first rows instead of rendering the whole frame.
data.head()

# Plotting clusters

In [None]:
# Same income/spending scatter as before, now coloured by the K-means label
# stored in data['cluster'] (one 'rainbow' colour per cluster id).
plt.scatter(
    data['Annual Income (k$)'],
    data['Spending Score (1-100)'],
    c=data['cluster'],
    cmap='rainbow',
)
plt.title('Customer clusters')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
# Render explicitly (as the elbow-method cell does) so the cell does not echo
# the Text(...) repr returned by the last labelling call.
plt.show()

# Optional Analysis

* The Yellow and Green groups have good spending scores, whereas the Blue group's spending is average.
* The two groups that need to be focused on are Purple and Red.
* I am taking the Purple cluster (high income, low spending score) for demonstration, as these customers have the potential to spend but are less inclined to do so.

In [None]:
# K-means cluster ids are arbitrary labels, so the target segment is located
# by its centroid instead of by a hard-coded id: among the clusters whose mean
# income is at least the overall mean income, take the one with the lowest
# mean spending score.
income_col = 'Annual Income (k$)'
spend_col = 'Spending Score (1-100)'
cluster_means = data.groupby('cluster')[[income_col, spend_col]].mean()
high_income = cluster_means[cluster_means[income_col] >= data[income_col].mean()]
target_cluster = high_income[spend_col].idxmin()

# Keep only that cluster's rows, drop the identifier column, and encode Gender
# numerically (Male -> 0, Female -> 1) so it is included in describe().
# Built as one chain without inplace mutation so re-running the cell always
# produces the same frame.
highIncome_LowSpend = (
    data.loc[data['cluster'] == target_cluster]
    .drop(columns=['CustomerID'])
    .assign(Gender=lambda frame: frame['Gender'].map({'Male': 0, 'Female': 1}))
)
highIncome_LowSpend.describe()

# Purple group gender distribution

We can see that there are more men than women in this group.

In [None]:
# Gender is stored as 0/1 in highIncome_LowSpend (Male -> 0, Female -> 1);
# translate the codes back to names so the slices and legend are readable.
gender_counts = (
    highIncome_LowSpend['Gender']
    .map({0: 'Male', 1: 'Female'})
    .value_counts()
)

plt.figure(figsize=(8, 8))
gender_counts.plot(
    kind='pie',
    autopct='%.2f%%',
    shadow=True,
    # Offset every slice except the first. Sized from the data so the call
    # still works if only one gender is present in the cluster.
    explode=[0.04 if position else 0 for position in range(len(gender_counts))],
)
plt.title('Gender distribution')
plt.legend()
plt.show()

# Purple group Age distribution

The target age range should be 34 to 48, as this age group contains the highest number of people. Curated offers or age-band-specific products can be used to improve the spending score.

In [None]:
# Stacked age histogram of the selected cluster, split by the Gender column,
# using 2-year-wide bins.
age_axes = sns.histplot(
    data=highIncome_LowSpend,
    x='Age',
    hue='Gender',
    binwidth=2,
    multiple='stack',
    palette='Pastel1',
)
age_axes.set_title('Age distribution')