## Import the dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

In [None]:
import warnings
warnings.filterwarnings('ignore')

## Data Collection

In [None]:
# Read the mall-customer CSV into a DataFrame and preview its first rows.
csv_path = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
data.shape

In [None]:
# Print a structural summary of the dataset: columns, non-null counts and dtypes
data.info()

In [None]:
# Count the missing entries in every column
data.isna().sum()

In [None]:
# Remove the CustomerID column, which this analysis treats as unnecessary.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps the cell idempotent: running it again once the column
# is already gone no longer raises a KeyError.
data = data.drop(columns=['CustomerID'], errors='ignore')

In [None]:
# Preview the first rows again to check which columns remain
data.head()

In [None]:
# Distribution of each numeric feature, one panel per column.
# sns.distplot is deprecated and scheduled for removal from seaborn; the
# histplot call below is its documented replacement and reproduces the same
# picture (density-scaled histogram with a KDE curve drawn over it).
plt.figure(1, figsize=(15,10))
# Spacing applies to the whole figure, so it is set once instead of per panel.
plt.subplots_adjust(hspace=0.5, wspace=0.5)
for n, x in enumerate(['Age','Annual Income (k$)','Spending Score (1-100)'], start=1):
    plt.subplot(1,3,n)
    sns.histplot(data[x], bins=20, kde=True, stat='density', kde_kws=dict(cut=3))
    plt.title("Distplot of {}".format(x))
plt.show()

In [None]:
# Horizontal bar chart with the number of customers per gender.
fig, ax = plt.subplots(figsize=(15,8))
sns.countplot(data=data, y='Gender', ax=ax)
plt.show()

In [None]:
# Violin plots of every numeric feature split by gender, laid out side by side.
fig, axes = plt.subplots(1, 3, num=1, figsize=(15,8))
fig.subplots_adjust(hspace=0.5, wspace=0.5)
for n, cols in enumerate(['Age','Annual Income (k$)','Spending Score (1-100)'], start=1):
    panel = axes[n - 1]
    sns.violinplot(data=data, x=cols, y='Gender', ax=panel)
    panel.set_title("ViolinPlot of {}".format(cols))
plt.show()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

In [None]:
# Split the customers into age bands (both ends inclusive) and count each band.
age = data['Age']
age_18_25 = age[age.between(18, 25)]
age_26_35 = age[age.between(26, 35)]
age_36_45 = age[age.between(36, 45)]
age_46_55 = age[age.between(46, 55)]
age_55above = age[age >= 56]

# The last band starts at 56 (age 55 is counted in '46-55'), so it is labelled
# '56+' instead of the previous off-by-one '55+'.
agex = ['18-25','26-35','36-45','46-55','56+']
agey = [len(band) for band in (age_18_25, age_26_35, age_36_45, age_46_55, age_55above)]

plt.figure(figsize=(15,8))
sns.barplot(x=agex, y=agey, palette='mako')
plt.title('No. of Customers and Ages')
plt.xlabel('Age')
plt.ylabel('Number of Customers')
plt.show()

In [None]:
# Group the spending scores into five equal-width bands (both ends inclusive)
# and plot how many customers land in each one.
spending = data['Spending Score (1-100)']
ss_1_20 = spending[spending.between(1, 20)]
ss_21_40 = spending[spending.between(21, 40)]
ss_41_60 = spending[spending.between(41, 60)]
ss_61_80 = spending[spending.between(61, 80)]
ss_81_100 = spending[spending.between(81, 100)]

ssx = ['1-20','21-40','41-60','61-80','81-100']
ssy = [len(band) for band in (ss_1_20, ss_21_40, ss_41_60, ss_61_80, ss_81_100)]

plt.figure(figsize=(15,8))
sns.barplot(x=ssx, y=ssy, palette='rocket')
plt.title('Spending Score')
plt.xlabel('Score')
plt.ylabel('Number of Customers having score')
plt.show()

In [None]:
# Group the annual incomes (in k$) into five bands (both ends inclusive) and
# plot how many customers land in each one.
income = data['Annual Income (k$)']
al_0_30 = income[income.between(0, 30)]
al_31_60 = income[income.between(31, 60)]
al_61_90 = income[income.between(61, 90)]
al_91_120 = income[income.between(91, 120)]
al_121_150 = income[income.between(121, 150)]

alx = ['$ 0-30,000','$ 30,001-60,000','$ 60,001-90,000','$ 90,001-120,000','$ 120,001-150,000']
aly = [len(band) for band in (al_0_30, al_31_60, al_61_90, al_91_120, al_121_150)]

plt.figure(figsize=(15,8))
sns.barplot(x=alx, y=aly, palette='Spectral')
plt.title('Annual Incomes')
plt.xlabel('Income')
plt.ylabel('Number of Customers')
plt.show()

In [None]:
# Build the clustering feature matrix from the annual-income and
# spending-score columns.
# The columns are picked by name rather than by position (2 and 3) so the
# selection stays correct whether or not CustomerID has been dropped and
# regardless of column order.
X = data[['Annual Income (k$)','Spending Score (1-100)']].values

# Show the shape and a short preview instead of printing every row.
print(X.shape)
print(X[:5])

In [None]:
# Choose the number of clusters with the elbow method.
# WCSS --> within-cluster sum of squares (read from the model's `inertia_`).
# Fit one model per candidate cluster count (1..10) and record its WCSS.
# n_init is set explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it out makes the curve depend on the installed version.
wcss = []
for i in range(1,11):
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X)

    wcss.append(kmeans.inertia_)

In [None]:
# Plot the elbow graph: WCSS against the number of clusters

sns.set()
fig, ax = plt.subplots()
ax.plot(range(1,11), wcss, color='red', marker='8', linewidth=2)
ax.set_title('Elbow Point Graph')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

#### Optimum Number of clusters = 5

In [None]:
# Final model with 5 clusters. n_init is set explicitly so the number of
# restarts does not depend on the installed scikit-learn version.
kmeans = KMeans(n_clusters=5,init='k-means++',n_init=10,random_state=42)

In [None]:
# Fit the model and keep the cluster index assigned to each data point
Y = kmeans.fit(X).labels_
print(Y)

In [None]:
# Calculate Silhoutte Score
from sklearn.metrics import silhouette_samples,silhouette_score
score = silhouette_score(X, kmeans.labels_, metric='euclidean')
#
# Print the score
#
print('Silhouetter Score: %.3f' % score)

In [None]:
# Silhouette plot for a 5-cluster KMeans.
# The visualizer receives a fresh estimator, so init and n_init are spelled out
# here to match the model scored above rather than relying on library defaults.
from yellowbrick.cluster import silhouette_visualizer
plt.figure(figsize=(10,8))
silhouette_visualizer(KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42), X, colors='yellowbrick')

In [None]:
# Plot every cluster in its own colour together with the cluster centroids.

plt.figure(figsize=(10,8))
cluster_colors = ['yellow','cyan','orange','red','violet']
for cluster_idx, color in enumerate(cluster_colors):
    members = X[Y == cluster_idx]
    plt.scatter(members[:,0], members[:,1], s=50, c=color,
                label='Cluster {}'.format(cluster_idx + 1))

# Plot the centroids
plt.scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1], s=100, c='black', label='Centroids')

plt.title('Customer Groups')
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
# Every scatter call sets a label, but nothing is shown unless the legend is drawn.
plt.legend()
plt.show()