In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from scipy.stats import norm, boxcox
from scipy import stats

In [None]:
# Read the mall-customer dataset from the Kaggle input directory and preview the first rows.
csv_path = "/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Check the number of rows and columns,
# the names of the columns,
# and the dtype of each column.
df.info()


In [None]:
# Summary statistics of the numeric columns (includes the five-number summary).
df.describe()

In [None]:
# Count the missing values in every column.
df.isna().sum()

# checking outliers

In [None]:
# I am using a box plot to identify the outliers.

In [None]:
# Box plot of annual income; points drawn beyond the whiskers are candidate outliers.
sns.boxplot(data=df, y="Annual Income (k$)")

# Treating the outliers

In [None]:
# First and third quartiles of annual income, and the interquartile range between them.
Q1, Q3 = (df['Annual Income (k$)'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

In [None]:
# Outlier fences: anything further than 1.5 * IQR outside the quartiles counts as an outlier.
fence = 1.5 * IQR
low_lim, up_lim = Q1 - fence, Q3 + fence
print('low_limit is', low_lim)
print('up_limit is', up_lim)

In [None]:
# Collect every annual-income value lying outside the [low_lim, up_lim] fences.
income = df['Annual Income (k$)']
outlier = income[(income > up_lim) | (income < low_lim)].tolist()
print(' outlier in the dataset is', outlier)

In [None]:
len(outlier)
# there are only 2 outlier values

In [None]:
# We replace the outliers with the median
# because the total number of observations is low,
# so we cannot afford to drop the rows that contain outliers.
# The median is not affected by outliers.

In [None]:
# Replace the outliers found above with the column median (61.5 for this dataset).
# The fences and the median are taken from the data instead of hard-coding 137 / 61.5,
# so low-side outliers are handled too and the cell keeps working if the data changes.
income_col = 'Annual Income (k$)'
# Cast to float first: the median can be fractional, and writing a float into an
# integer column relies on a silent upcast that recent pandas versions deprecate.
df[income_col] = df[income_col].astype(float)
is_outlier = (df[income_col] > up_lim) | (df[income_col] < low_lim)
df.loc[is_outlier, income_col] = df[income_col].median()

In [None]:
# The outliers have been replaced; redraw the box plot to confirm.
sns.boxplot(data=df, y="Annual Income (k$)")

In [None]:
# Look for outliers in the spending score.
sns.boxplot(data=df, y="Spending Score (1-100)")
# no outliers

In [None]:
# Look for outliers in age.
sns.boxplot(data=df, y="Age")
# no outliers

# checking skewness for spending score

In [None]:
# Distribution of the spending score (left) next to a normal Q-Q plot (right).
# sns.distplot is deprecated, so the histogram + KDE are drawn with sns.histplot and
# the fitted normal curve (what `fit=norm` used to add) is drawn explicitly.
score = df["Spending Score (1-100)"]
fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(10, 4))

sns.histplot(x=score, kde=True, stat="density", color="orange", ax=ax_dist)
mu, sigma = norm.fit(score)  # maximum-likelihood normal fit
grid = np.linspace(score.min(), score.max(), 200)
ax_dist.plot(grid, norm.pdf(grid, mu, sigma), color="black")
ax_dist.set_title("Spending Score Distplot", color="darkred")

# probplot accepts a Matplotlib Axes as the plotting target
stats.probplot(score, plot=ax_qq)
plt.show()

In [None]:
# no need for skewness correction

# checking skewness for annual income

In [None]:
# Distribution of annual income (left) next to a normal Q-Q plot (right).
# sns.distplot is deprecated, so the histogram + KDE are drawn with sns.histplot and
# the fitted normal curve (what `fit=norm` used to add) is drawn explicitly.
income_values = df["Annual Income (k$)"]
fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(10, 4))

sns.histplot(x=income_values, kde=True, stat="density", color="orange", ax=ax_dist)
mu, sigma = norm.fit(income_values)  # maximum-likelihood normal fit
grid = np.linspace(income_values.min(), income_values.max(), 200)
ax_dist.plot(grid, norm.pdf(grid, mu, sigma), color="black")
ax_dist.set_title("Annual Income (k$) Distplot", color="darkred")

# probplot accepts a Matplotlib Axes as the plotting target
stats.probplot(income_values, plot=ax_qq)
plt.show()

In [None]:
# no need for skewness correction

# Visualizing the annual income vs spending score.
# This will help us in deciding how many cluster we need.

In [None]:
# Scatter of annual income against spending score to eyeball the natural groupings.
sns.scatterplot(data=df, x="Annual Income (k$)", y="Spending Score (1-100)")

In [None]:
# Build the clusters on these two features: 1. annual income 2. spending score

In [None]:
# Feature matrix for clustering: annual income and spending score.
# The columns are selected by name rather than by position ([3, 4]) so the cell does
# not silently pick the wrong features if the column order of the CSV changes.
clus = df[["Annual Income (k$)", "Spending Score (1-100)"]].values

In [None]:
# First we need to decide the number of clusters;
# for this we use the elbow curve.
# By looking at the elbow curve and the earlier scatter plot we can easily decide
# how many clusters we actually need.

In [None]:
# WCSS = within-cluster sum of squares (exposed by KMeans as `inertia_`).
# We compute it for k = 1..10 because the elbow curve needs it
# to identify how many clusters are required.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto',
# so leaving it out makes the results depend on the installed version.
wcss = [
    KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit(clus).inertia_
    for k in range(1, 11)
]

In [None]:
# Elbow curve: WCSS plotted against the number of clusters.

fig, ax = plt.subplots()
ax.set(xlabel="Number of Clusters", ylabel="WCSS")
ax.plot(range(1,11), wcss)
ax.set_title('The Elbow Point Graph')

plt.show()

In [None]:
# The optimum number of clusters is 6.
# We choose the number of clusters after which the decrease in WCSS becomes small or almost constant.

In [None]:
# Train the k-means algorithm with 6 clusters.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto',
# so leaving it out makes the labels depend on the installed version.
kmeans = KMeans(n_clusters=6, init='k-means++', n_init=10, random_state=42)

# fit_predict returns the cluster label of each data point
Y = kmeans.fit_predict(clus)

print(Y)

In [None]:
print(np.unique(Y))
# the distinct cluster labels; their count is the number of clusters

In [None]:

plt.figure(figsize=(10,8))
# Draw one scatter per label actually present in Y. Looping over the labels (instead
# of hard-coding 0-4) guarantees that no fitted cluster is left undrawn.
palette = ['orange', 'red', 'lime', 'coral', 'blue', 'purple']
for label in np.unique(Y):
    members = clus[Y == label]
    plt.scatter(members[:, 0], members[:, 1], s=60,
                c=palette[label % len(palette)],
                label='Cluster {}'.format(label + 1))

# plot the centroids
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], s=100, c='cyan', label='Centroids')

plt.title('customer_segments')
plt.xlabel('Annual_Income')
plt.ylabel('Spending_Score')
plt.legend()  # without this call the labels passed to scatter are never displayed
plt.show()

In [None]:
# The model above was fitted with 6 clusters, but the income-vs-spending scatter plot
# shows about 5 visually distinct groups, so 5 clusters are enough.
# Now we will refit the model with 5 clusters.

In [None]:
# Train the k-means algorithm with 5 clusters.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto',
# so leaving it out makes the labels depend on the installed version.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)

# fit_predict returns the cluster label of each data point
Y = kmeans.fit_predict(clus)

print(Y)

In [None]:
print(np.unique(Y))
# the distinct cluster labels; their count is the number of clusters

In [None]:

plt.figure(figsize=(10,8))
# Draw one scatter per label present in Y, reusing the same colour for the same label.
palette = ['orange', 'red', 'lime', 'coral', 'blue']
for label in np.unique(Y):
    members = clus[Y == label]
    plt.scatter(members[:, 0], members[:, 1], s=60,
                c=palette[label % len(palette)],
                label='Cluster {}'.format(label + 1))

# plot the centroids
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], s=200, c='black', label='Centroids')

plt.title('customer_segments')
plt.xlabel('Annual_Income')
plt.ylabel('Spending_Score')
plt.legend()  # without this call the labels passed to scatter are never displayed
plt.show()

# Conclusion

In [None]:
# We can clearly see the 5 clusters, with distinct separation between the groups.
# Using this information we can run targeted advertisements for different groups with different needs
# to increase our sales and profit.
# We can also decide to provide coupons and measure the effect of the coupons on sales in the different groups.

#done