In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the mall customer CSV into the working frame `cust`.
mall_csv_path="../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
cust=pd.read_csv(mall_csv_path)

In [None]:
cust.head(5)

In [None]:
cust.describe()

In [None]:
cust["Gender"].value_counts()

In [None]:
%matplotlib inline
sns.distplot(cust["Spending Score (1-100)"])
plt.xlim(0,100)
#plt.xticks(ticks=[0,10,20,30,40,50,60,70,80,90,100])
plt.title("Spending Score Distribution")

In [None]:
# Distribution of customer age, with the x-axis limited to 15-70.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm the
# installed version before upgrading the environment.
age_ax=sns.distplot(cust["Age"],color="pink")
age_ax.set_xlim(15,70)
age_ax.set_title("Age Distribution of Customers")

In [None]:
%matplotlib inline
sns.scatterplot(cust["Age"],cust["Spending Score (1-100)"],color="green",hue=cust["Gender"])
plt.title(" Spending Score vs Age")

# Higher spending scores are assigned to individuals between the ages of 20 and 40. Age is a very important factor in determining clusters.

In [None]:
# Spending score against annual income, with points coloured by gender.
sns.scatterplot(data=cust,x="Annual Income (k$)",y="Spending Score (1-100)",hue="Gender")

# Annual Income does not have a huge impact on the spending score

In [None]:
# Drop the identifier column before the analysis.
# errors="ignore" keeps this cell re-runnable: without it, a second execution raises
# KeyError because the column has already been removed.
cust=cust.drop(columns="CustomerID",errors="ignore")

In [None]:
# Confirm the frame after dropping the identifier column (head() shows 5 rows by default).
cust.head()

In [None]:
# Pairwise correlation of the numeric columns only.
# "Gender" is one-hot encoded only in a later cell, so it is presumably still a text
# column here; pandas >= 2.0 raises on non-numeric columns in corr() instead of
# silently skipping them, so the numeric columns are selected explicitly.
corr=cust.select_dtypes(include="number").corr()

In [None]:
# Annotated heatmap of the correlation matrix, drawn with square cells.
sns.heatmap(corr,annot=True,cmap="viridis",square=True)

# Data Processing

In [None]:
# One-hot encode "Gender"; drop_first=True drops the indicator of the first category.
cust=pd.get_dummies(data=cust,columns=["Gender"],drop_first=True)
cust.head()

In [None]:
# Shorter column labels. They are assigned by position, so the order must match the
# frame's current column order.
short_names=["Age","Annual Income","Spending Score","Gender"]
cust.columns=short_names
cust.head()

In [None]:
# Work on an independent (deep) copy so `cust` keeps the unscaled values.
cust_transform=cust.copy(deep=True)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
scaler=StandardScaler()

In [None]:
cust_transform["Age"]=scaler.fit_transform(cust["Age"].values.reshape(-1,1))
cust_transform["Annual Income"]=scaler.fit_transform(cust["Annual Income"].values.reshape(-1,1))
cust_transform["Spending Score"]=scaler.fit_transform(cust["Spending Score"].values.reshape(-1,1))

In [None]:
# Standardised spending score against standardised age, coloured by gender.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.scatterplot(x=cust_transform["Age"],y=cust_transform["Spending Score"],hue=cust_transform["Gender"],color="pink")
plt.title("Transformed Data (Spending Score vs Age)")

In [None]:
# Standardised spending score against standardised annual income, coloured by gender.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.scatterplot(x=cust_transform["Annual Income"],y=cust_transform["Spending Score"],hue=cust_transform["Gender"],color="pink")
# The title names the variables actually plotted (it previously mentioned Age).
plt.title("Transformed Data (Spending Score vs Annual Income)")

In [None]:
# Elbow method: fit KMeans for k = 1..11 and record the within-cluster sum of squares
# (the model's inertia_) for each k.
cluster_range=range(1,12)
wcss=[]
for i in cluster_range:
    # random_state makes the curve reproducible across runs; n_init=10 pins the number
    # of initialisations instead of relying on a default that differs between
    # scikit-learn versions.
    kmeans=KMeans(n_clusters=i,n_init=10,random_state=42).fit(cust_transform)
    wcss.append(kmeans.inertia_)

sns.lineplot(x=list(cluster_range),y=wcss,color="red")
plt.xlabel("Number of clusters")
plt.ylabel("WCSS (inertia)")
plt.title("The Elbow method")

In [None]:
kmeans_model=KMeans(n_clusters=5).fit(cust_transform)

In [None]:
kmeans_cluster=kmeans_model.predict(cust_transform)
kmeans_cluster

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette coefficient for k = 2..11 (the score needs at least two clusters).
silhouette_range=range(2,12)
silhouette_coefficients=[]
for i in silhouette_range:
    # random_state / n_init make the scores reproducible across runs and
    # scikit-learn versions.
    kmeans=KMeans(n_clusters=i,n_init=10,random_state=42)
    kmeans.fit(cust_transform)
    score=silhouette_score(cust_transform,kmeans.labels_)
    silhouette_coefficients.append(score)

sns.lineplot(x=list(silhouette_range),y=silhouette_coefficients,color="green")
plt.xlabel("Number of clusters")
plt.ylabel("Silhouette coefficient")
plt.title("Silhouette analysis")

# Based on the analysis we will go for 6 clusters in the dataset

In [None]:
# Final six-cluster model. random_state fixes the initialisation so the cluster numbers
# stay the same between runs (the interpretation below refers to clusters by number);
# n_init=10 pins the number of initialisations explicitly.
kmeans_model2=KMeans(n_clusters=6,n_init=10,random_state=42).fit(cust_transform)
kmeans_cluster2=kmeans_model2.predict(cust_transform)

In [None]:
cust=pd.concat([cust,pd.DataFrame({"Cluster Number":kmeans_cluster2})],axis=1)
cust

In [None]:
cust["Cluster Number"].value_counts()

# Data Visualisation of Clusters

In [None]:
# Clusters in the age / spending-score plane, one colour per cluster.
plt.figure(figsize=(10,10))
sns.scatterplot(data=cust,x="Age",y="Spending Score",hue="Cluster Number",palette="Set1",legend="full",s=100)

In [None]:
# Clusters in the annual-income / spending-score plane, one colour per cluster.
plt.figure(figsize=(10,10))
sns.scatterplot(data=cust,x="Annual Income",y="Spending Score",hue="Cluster Number",palette="Dark2",legend="full",s=100)

In [None]:
# Mean spending score of each cluster as a bar chart.
cust.groupby("Cluster Number")["Spending Score"].mean().plot.bar(color="green")

In [None]:
# Mean age and mean annual income of each cluster, as grouped bars.
cust.groupby("Cluster Number")[["Age","Annual Income"]].mean().plot.bar()

# Cluster 2 & Cluster 4 are the clusters that have high spending scores. Cluster 2 largely comprises people who have high income and are middle-aged. Cluster 4 has much the same age profile but lower income. Both clusters indicate that age is more closely related to spending score than income is.