**Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import missingno as msno
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import warnings 
# Dark theme for every matplotlib figure drawn after this cell.
plt.style.use("dark_background")

# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting libraries.
warnings.simplefilter("ignore")

In [None]:
# Location of the Mall Customers CSV, relative to the notebook.
csv_path = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
df = pd.read_csv(csv_path)

# Preview the first ten rows.
df.head(n=10)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# missingno dendrogram of the frame's columns, with grid lines switched on afterwards.
dendrogram_size = (11, 7)
msno.dendrogram(df, figsize=dendrogram_size)
plt.grid()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

**Exploratory Data Analysis (EDA)**

In [None]:
# Number of customers per gender.
# The column is named through `x=`/`data=` rather than passed positionally:
# seaborn >= 0.12 interprets the first positional argument as `data`, not `x`.
sb.countplot(x='Gender', data=df, saturation=.66, palette='Accent')

In [None]:
# Bar chart of spending score over annual income.
income = px.bar(
    data_frame=df,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    template="plotly_dark",
)
# update_layout returns the same figure, so the calls can be chained.
income.update_layout(title_text="Spending Score against Annual Income").show()

In [None]:
# Horizontal, density-normalised histogram of customer ages.
ages = df['Age']
plt.hist(x=ages, orientation="horizontal", density=True, color="white")
# Bars run sideways, so age is on the y axis and density on the x axis.
plt.ylabel("Age")
plt.xlabel("Density")
plt.title("Ages")

In [None]:
# Female customers only. The label is matched exactly: a substring/regex test is
# fragile for this column because "male" is itself a substring of "female".
female_expen = df[df['Gender'] == "Female"]

# Income vs. spending score for that subset, coloured by age.
fem_exp = px.scatter(female_expen, x="Annual Income (k$)", y="Spending Score (1-100)", template="plotly_dark", color="Age")
fem_exp.update_layout(title_text="Expenditure of Female")
fem_exp.show()

In [None]:
# Male customers only. The label is matched exactly: with a substring test, only the
# capital "M" keeps "Female" rows out, which breaks as soon as the casing changes.
male_expen = df[df['Gender'] == "Male"]

# Income vs. spending score for that subset, coloured by age.
male_exp = px.scatter(male_expen, x="Annual Income (k$)", y="Spending Score (1-100)", template="plotly_dark", color="Age")
male_exp.update_layout(title_text="Expenditure of Male")
male_exp.show()

In [None]:
# Violin plot of spending score per age, split by gender.
violin_options = dict(
    x="Age",
    y="Spending Score (1-100)",
    color="Gender",
    template="plotly_dark",
)
exp = px.violin(df, **violin_options)
exp.update_layout(title_text="Expenditure by Ages").show()

In [None]:
# Pairwise relationships between the three numeric features, coloured by gender.
pairplot_columns = ['Spending Score (1-100)', 'Annual Income (k$)', 'Age']
sb.pairplot(data=df, vars=pairplot_columns, hue="Gender")

**Customer Segmentation**

In [None]:
# Clustering features: annual income and spending score.
# Columns are selected by name; a positional boolean mask silently picks the wrong
# columns (or raises) as soon as the frame gains, loses or reorders a column.
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
income_score = df[feature_columns].values

# Rescale both features to [0, 1] so neither dominates the distance computation.
scaler = MinMaxScaler()
scaled_income_score = scaler.fit_transform(income_score)

**Elbow Method to find the optimal number of clusters (K)**

In [None]:
# Fit K-Means for K = 1..9 on the scaled features and record each model's inertia.
elbow = []
k = range(1,10)
for i in k:
  # Fixed seed so the curve is identical on every Restart & Run All.
  KModel = KMeans(n_clusters=i, init='k-means++', random_state=42)
  KModel.fit(scaled_income_score)
  elbow.append(KModel.inertia_)

# x and y must be passed by keyword: the first positional parameter of px.line is
# `data_frame`, so px.line(k, elbow) would treat `k` as the data and `elbow` as x.
# With bare array inputs the columns are named "x" and "y", which `labels` renames.
px.line(x=list(k), y=elbow, template="plotly_dark", labels={"x":"Number of Clusters", "y":"Distortion"}, title="Elbow Method")

**Silhouette Method**

In [None]:
# Silhouette score of the last model left over from the elbow loop (K = 9).
# That model was fitted on the scaled features, so its labels are scored against the
# same scaled features rather than the raw ones.
silhouette_score(scaled_income_score, KModel.labels_)

In [None]:
# One fitted K-Means model per cluster count 1..9; index i holds the model for K = i + 1.
kmeans_per_k = []
for n_clusters in range(1, 10):
  candidate = KMeans(n_clusters=n_clusters, random_state=42)
  kmeans_per_k.append(candidate.fit(income_score))

# The K = 1 model is skipped, so the scores list starts at K = 2.
silhouette_scores = []
for fitted in kmeans_per_k[1:]:
  silhouette_scores.append(silhouette_score(income_score, fitted.labels_))

In [None]:
plt.figure(figsize=(12,8))
# Colour is set only through `color=`; putting a colour letter in the format string
# as well makes matplotlib warn about a redundant colour definition.
plt.plot(range(2,10), silhouette_scores, "o-", color="white", label='Silhouette Curve')
plt.xlabel("Number of Clusters")
# The y axis carries the score values themselves.
plt.ylabel("Silhouette Score")
plt.grid(linestyle='--')
plt.title("Silhouette Curve for optimal number of clusters")

# silhouette_scores[0] belongs to K = 2, hence the +2 offset. int() converts NumPy's
# integer to a plain Python int for later use as an index and a cluster count.
k = int(np.argmax(silhouette_scores)) + 2

# Mark the best K with a vertical line and an enlarged marker on the curve.
plt.axvline(x=k, c="yellow", linestyle='--', linewidth=3, label='Optimal number of clusters({})'.format(k))
plt.scatter(k, silhouette_scores[k-2], s=400)
plt.legend(shadow=True)
plt.show()

In [None]:
# Report the cluster count stored in `k`.
print(f"Optimal number of cluster is : {k}")

In [None]:
# Open the 12x8-inch figure that the decision-boundary call at the end of this cell draws on.
plt.figure(figsize=(12, 8))
def decision_boundaries(clusterer, x, resolution = 2000, show_centroids = True, show_xlabels = True, show_ylabels = True, shadow=True):
  """Draw the regions a fitted clusterer assigns to each cluster, with the data on top.

  A regular grid spanning the first two columns of ``x`` (padded by 0.1 on each
  side) is classified with ``clusterer.predict``; the resulting label map is
  drawn as filled regions with black boundary lines, followed by the data
  points and, optionally, the cluster centres.

  Parameters
  ----------
  clusterer : fitted estimator exposing ``predict`` and ``cluster_centers_``.
  x : 2-D array; columns 0 and 1 define the plotted plane.
  resolution : number of grid points along each axis.
  show_centroids : when true, ``plot_centroids`` is called with the centres.
  show_xlabels, show_ylabels : when false, the tick labels of that axis are
      hidden instead of an axis label being drawn.
  shadow : accepted for backward compatibility; not used.
  """
  # The plotted extent is derived from the data that was passed in, not from a global.
  mins = x.min(axis=0) - 0.1
  maxs = x.max(axis=0) + 0.1
  # NOTE: this switches the global matplotlib style for everything drawn afterwards.
  plt.style.use("fivethirtyeight")

  xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
                       np.linspace(mins[1], maxs[1], resolution))
  # Classify every grid point, then fold the flat label vector back into grid shape.
  Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
  Z = Z.reshape(xx.shape)
  # Filled regions first; a second line contour at the same levels would be hidden
  # entirely by the black boundary lines drawn next.
  plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]), cmap='RdYlBu_r')
  plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]), linewidths=1, colors='k')
  plot_data(x)
  if show_centroids:
    plot_centroids(clusterer.cluster_centers_)
  if show_xlabels:
    plt.xlabel("$x_1$", fontsize=14)
  else:
    plt.tick_params(labelbottom=False)
  if show_ylabels:
    plt.ylabel("$x_2$", fontsize=14, rotation=0)
  else:
    plt.tick_params(labelleft=False)


def plot_data(income_score):
  """Plot the rows of ``income_score`` as black dots (column 0 on x, column 1 on y)."""
  horizontal = income_score[:, 0]
  vertical = income_score[:, 1]
  plt.plot(horizontal, vertical, 'k.')

def plot_centroids(centroids, weights=None, circle_color='red', cross_color='k', shadow=True):
  """Mark cluster centres with a filled circle and a cross on top of it.

  Parameters
  ----------
  centroids : 2-D array; column 0 is used as x and column 1 as y.
  weights : optional 1-D array aligned with ``centroids``. When given, only
      centres whose weight exceeds one tenth of the largest weight are drawn.
  circle_color : colour of the circular marker.
  cross_color : colour of the cross marker.
  shadow : accepted for backward compatibility; not used (``plt.scatter`` has
      no ``shadow`` option and rejects it).
  """
  if weights is not None:
    centroids = centroids[weights>weights.max() / 10]
  # The markers are drawn whether or not weights were supplied; only the filtering
  # above is conditional.
  plt.scatter(centroids[:,0], centroids[:,1], marker='o', s=50, linewidths=9, color=circle_color, zorder=10, alpha=0.9)
  # A thin stroke keeps the cross readable; a very wide one swallows the marker shape.
  plt.scatter(centroids[:,0], centroids[:,1], marker='x', s=80, linewidths=3, color=cross_color, zorder=11, alpha=1)


# kmeans_per_k[0] is the K = 1 model, so the model for `k` clusters sits at index k - 1.
best_model = kmeans_per_k[k-1]
decision_boundaries(best_model, income_score)
plt.show()

In [None]:
# One colour per customer segment; the number of clusters follows from this list, so
# adding or removing a colour is the only change needed to plot a different count.
# NOTE(review): the silhouette cell above stores its preferred cluster count in `k`,
# while this cell fixes the count at 4 — confirm which one is intended.
SEGMENT_COLORS = ['g', 'b', 'r', 'yellow']
N_SEGMENTS = len(SEGMENT_COLORS)

kmeanModel = KMeans(n_clusters=N_SEGMENTS,init='k-means++',max_iter=300,n_init=10,random_state=0)
y_kmeans= kmeanModel.fit_predict(income_score)

plt.figure(figsize=(8,8))
# Draw each cluster's members in its own colour; labels are 1-based for the legend.
for cluster_id, colour in enumerate(SEGMENT_COLORS):
  members = income_score[y_kmeans == cluster_id]
  plt.scatter(members[:, 0], members[:, 1], s = 60, c = colour, label = 'Cluster {}'.format(cluster_id + 1))
plt.scatter(kmeanModel.cluster_centers_[:, 0], kmeanModel.cluster_centers_[:, 1], s = 80, c = 'black', marker='x', label = 'Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()