# Importing the required libraries.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from scipy.stats import norm, boxcox
from scipy import stats

# Loading the dataset.

In [None]:
# Read the Mall Customers CSV from the Kaggle input directory and preview the first rows.
csv_path = "/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
df = pd.read_csv(csv_path)
df.head()

# Dataset Information.

In [None]:
# Print a concise summary of the DataFrame:
#   - number of rows and columns
#   - dtype of each column
#   - name of each column
df.info()

In [None]:
# Checking the summary statistics of the dataset.
df.describe()

In [None]:
#Checking for null values.

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Check the "Annual Income (k$)" column for outliers with a box plot.
sns.boxplot(y="Annual Income (k$)", data=df)
# Observation: outliers are present in this column.

In [None]:
# Check the "Spending Score (1-100)" column for outliers with a box plot.
sns.boxplot(y="Spending Score (1-100)", data=df)
# Observation: no outliers in this column.

In [None]:
# Check the "Age" column for outliers with a box plot.
sns.boxplot(y="Age", data=df)
# Observation: no outliers in this column.

In [None]:
# Left panel: histogram + KDE of annual income with a fitted normal curve overlaid.
# Right panel: normal Q-Q (probability) plot of the same column.
# sns.distplot is deprecated, so the histogram/KDE come from sns.histplot and the
# normal fit that distplot(fit=norm) used to draw is plotted explicitly.
income = df["Annual Income (k$)"]
fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(10, 4))

sns.histplot(income, kde=True, stat="density", color="orange", ax=ax_hist)
mu, sigma = norm.fit(income)
grid = np.linspace(income.min(), income.max(), 200)
ax_hist.plot(grid, norm.pdf(grid, mu, sigma), color="black", label="Normal fit")
ax_hist.set_title("Annual Income (k$) Distplot", color="darkred")
ax_hist.legend()

# probplot accepts a Matplotlib Axes as its plot target.
stats.probplot(income, plot=ax_qq)
plt.show()

No need to correct for skewness because the data is normally distributed.

In [None]:
# Left panel: histogram + KDE of spending score with a fitted normal curve overlaid.
# Right panel: normal Q-Q (probability) plot of the same column.
# sns.distplot is deprecated, so the histogram/KDE come from sns.histplot and the
# normal fit that distplot(fit=norm) used to draw is plotted explicitly.
spending = df["Spending Score (1-100)"]
fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(10, 4))

sns.histplot(spending, kde=True, stat="density", color="orange", ax=ax_hist)
mu, sigma = norm.fit(spending)
grid = np.linspace(spending.min(), spending.max(), 200)
ax_hist.plot(grid, norm.pdf(grid, mu, sigma), color="black", label="Normal fit")
ax_hist.set_title("Spending Score (1-100) Distplot", color="darkred")
ax_hist.legend()

# probplot accepts a Matplotlib Axes as its plot target.
stats.probplot(spending, plot=ax_qq)
plt.show()

No need to correct for skewness because the data is normally distributed.

# Checking for outliers in "Annual Income"

In [None]:
# First and third quartiles of annual income, and the inter-quartile range between them.
Q1, Q3 = df['Annual Income (k$)'].quantile([0.25, 0.75])
IQR = Q3 - Q1

In [None]:
# Outlier fences: anything more than 1.5 * IQR below Q1 or above Q3 is treated as an outlier.
whisker = 1.5 * IQR
low_lim = Q1 - whisker
up_lim = Q3 + whisker
print('low_limit is', low_lim)
print('up_limit is', up_lim)

In [None]:
# Collect every annual-income value lying outside the IQR fences.
# A boolean mask replaces the explicit Python loop (vectorized pandas idiom).
income = df['Annual Income (k$)']
outlier = income[(income > up_lim) | (income < low_lim)].tolist()
print('Outlier in the dataset is', outlier)

Replacing the outliers with the median.

In [None]:
# The dataset is small, so outliers are replaced with the column median instead of
# dropping rows. The fences (low_lim / up_lim) and the median are computed from the
# data rather than hard-coded, so the cell stays correct if the data changes.
income_col = 'Annual Income (k$)'
income_median = df[income_col].median()
is_outlier = (df[income_col] > up_lim) | (df[income_col] < low_lim)

# Cast to float first: the median may be fractional, and recent pandas versions warn
# about (and may reject) writing a float into an integer column.
df[income_col] = df[income_col].astype(float)
df.loc[is_outlier, income_col] = income_median

In [None]:
# Show only the five highest-income rows instead of dumping the whole frame:
# enough to confirm that no extreme income values remain after the replacement.
df.sort_values('Annual Income (k$)', ascending=False).head()

In [None]:
# Checking for outliers again after replacing them with the median.
sns.boxplot(y="Annual Income (k$)", data=df)
# Observation: no outliers remain after replacing them with the median.

In [None]:
# Plotting a scatter plot between annual income and spending score.
# Which will help in deciding the number of clusters.

In [None]:
# Scatter of annual income against spending score; visible groupings hint at the cluster count.
sns.scatterplot(data=df, x="Annual Income (k$)", y="Spending Score (1-100)")

Making clusters on "Annual Income" and "Spending Score".

In [None]:
# Feature matrix for clustering: annual income and spending score.
# Columns are selected by name rather than by position so the selection does not
# silently change if the CSV column order differs.
k = df[["Annual Income (k$)", "Spending Score (1-100)"]].to_numpy()
# Preview the first rows only instead of printing the whole array.
k[:5]

Deciding the number of clusters using the elbow curve.
We will try a range of cluster counts and choose the optimal one.

In [None]:
# Compute WCSS (within-cluster sum of squares, exposed by scikit-learn as `inertia_`)
# for 1..10 clusters; the values feed the elbow plot.
# n_init is set explicitly because its default changed to 'auto' in scikit-learn 1.4
# (a single initialisation for k-means++), which would make results version-dependent.
wcss = []

for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(k)
    wcss.append(kmeans.inertia_)

In [None]:
# Draw the elbow curve: WCSS against the number of clusters tried.
sns.set()
fig, ax = plt.subplots()
ax.plot(range(1, 11), wcss)
ax.set_title('The Elbow Point Graph')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('WCSS')
plt.show()

From the elbow point graph and the scatter plot we plotted earlier, 5 looks like a suitable number of clusters.

# Training the K-means model.

For a better understanding, let's first set the number of clusters to 4 and see what happens.

In [None]:
# Fit K-means with 4 clusters.
# n_init is set explicitly because its default changed to 'auto' in scikit-learn 1.4,
# which would otherwise make the result depend on the installed version.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=0)
# fit_predict returns the cluster label assigned to each data point.
Y = kmeans.fit_predict(k)

print(Y)

In [None]:
# Plot each of the 4 clusters in its own colour, plus the cluster centres (means).
cluster_colors = ['cyan', 'gold', 'coral', 'olive']

plt.figure(figsize=(8, 8))
for idx, color in enumerate(cluster_colors):
    members = k[Y == idx]
    plt.scatter(members[:, 0], members[:, 1], s=50, c=color, label=f'Cluster {idx + 1}')

# Means
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=150, c='brown', label='Means')

plt.title('Groups')
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.legend()  # the label= arguments above are only displayed once a legend is drawn
plt.show()

From the clustering with 4 clusters, we can see that we actually need 5 clusters for a better grouping.

Now let's set the number of clusters to 5 and look at the grouping.

In [None]:
# Fit K-means with 5 clusters.
# n_init is set explicitly because its default changed to 'auto' in scikit-learn 1.4,
# which would otherwise make the result depend on the installed version.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=0)
# fit_predict returns the cluster label assigned to each data point.
Y = kmeans.fit_predict(k)

print(Y)

In [None]:
# Plot each of the 5 clusters in its own colour, plus the cluster centres (means).
cluster_colors = ['cyan', 'gold', 'coral', 'olive', 'orange']

plt.figure(figsize=(8, 8))
for idx, color in enumerate(cluster_colors):
    members = k[Y == idx]
    plt.scatter(members[:, 0], members[:, 1], s=50, c=color, label=f'Cluster {idx + 1}')

# Plot the means
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=150, c='brown', label='Means')

plt.title('Groups')
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.legend()  # the label= arguments above are only displayed once a legend is drawn
plt.show()

# Conclusion

By clustering, we can identify and target the mall's potential customers and use this to increase profit.
We can also devise a different strategy for each cluster, offering deals and running advertisements tailored to each customer group.