In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from scipy import stats

In [None]:
# Read the mall-customer CSV from the Kaggle input directory and preview it.
csv_path = "/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
#df.drop("Gender",axis=1,inplace=True)
#df.head()

In [None]:
# Descriptive summary statistics of the dataset.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Pairwise correlation of the numeric features with each other.
# numeric_only=True leaves non-numeric columns out of the computation
# (e.g. "Gender", presumably a text column); without it pandas >= 2.0
# raises an error when it tries to convert such columns to float.
corr = df.corr(numeric_only=True)
print(corr)

In [None]:
# Annotated heatmap of the correlation matrix on a 10x4-inch canvas.
fig, heat_ax = plt.subplots(figsize=(10, 4))
ax = sns.heatmap(data=corr, cmap="coolwarm", annot=True, ax=heat_ax)

# checking outliers

In [None]:
# I will use a box plot to identify the outliers.

In [None]:
# Box plot of annual income; points drawn beyond the whiskers are outlier candidates.
sns.boxplot(data=df, y="Annual Income (k$)")

# Treating the outliers

In [None]:
# First and third quartile of annual income, and the interquartile range between them.
income = df['Annual Income (k$)']
q1, q3 = (income.quantile(p) for p in (0.25, 0.75))
IQR = q3 - q1

In [None]:
# Outlier fences: 1.5 * IQR below the first quartile and above the third quartile.
fence = 1.5 * IQR
low_lim = q1 - fence
up_lim = q3 + fence
print('low_limit is', low_lim)
print('up_limit is', up_lim)

In [None]:
# Collect every income value that falls outside the [low_lim, up_lim] fences.
income_values = df["Annual Income (k$)"]
outside_fences = (income_values > up_lim) | (income_values < low_lim)
outlier = income_values[outside_fences].tolist()
print("outlier in the dataset is",outlier)

In [None]:
# Number of income values collected in the `outlier` list.
len(outlier)

In [None]:
# The median is not affected by outliers, so we will replace the outliers with the median,
# since there are only a few of them.

In [None]:
# Replace income outliers with the column median (the median is robust to outliers).
# The threshold and the replacement value are derived from the data and from the
# IQR fences (`low_lim`, `up_lim`) instead of the hard-coded 137 / 61.5, so the
# cell stays correct if the data changes and values below the lower fence are
# handled as well.
income_col = 'Annual Income (k$)'
income_median = df[income_col].median()
outlier_mask = (df[income_col] > up_lim) | (df[income_col] < low_lim)
# Cast to float first, in case the column was read as integers: writing a
# fractional median into an integer column is deprecated in recent pandas.
df[income_col] = df[income_col].astype(float)
df.loc[outlier_mask, income_col] = income_median

In [None]:
# Re-draw the income box plot after the outlier treatment.
sns.boxplot(data=df, y="Annual Income (k$)")

In [None]:
# Box plot of the spending score to look for outliers.
sns.boxplot(data=df, y="Spending Score (1-100)")


In [None]:
# Box plot of age to look for outliers.
sns.boxplot(data=df, y="Age")

In [None]:
# No outliers were found for Age and Spending Score (1-100).

# Checking skewness for spending score

In [None]:
# Skewness check for the spending score:
# histogram on the left, probability (Q-Q) plot on the right.
fig, (hist_ax, prob_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.histplot(df["Spending Score (1-100)"], color="orange", ax=hist_ax)
hist_ax.set_title("Spending Score Distplot", color="darkred")
stats.probplot(df["Spending Score (1-100)"], plot=prob_ax)
# Title fixed: it previously read "Splendid Score Probability PLot".
prob_ax.set_title("Spending Score Probability Plot", color="darkred")
plt.show()

# Checking skewness for annual income

In [None]:
# Skewness check for annual income:
# histogram on the left, probability plot on the right.
income_fig, (hist_ax, prob_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.histplot(df["Annual Income (k$)"], color="orange", ax=hist_ax)
hist_ax.set_title("Annual Income (k$) Distplot", color="darkred")
stats.probplot(df["Annual Income (k$)"], plot=prob_ax)
prob_ax.set_title("Annual Income (k$) Probability Plot", color="darkred")
plt.show()

# Visualizing annual income vs. spending score to decide the number of clusters

In [None]:
# Scatter plot of spending score against annual income, one point per row of df.
sns.scatterplot(data=df, x="Annual Income (k$)", y="Spending Score (1-100)")

In [None]:
# Form the clusters on these two features: 1. annual income, 2. spending score

In [None]:
# Feature matrix for clustering: annual income (column 0) and spending score (column 1).
# The columns are selected by name rather than by position ([3, 4]) so the
# selection keeps working if columns are added, dropped or reordered
# (for example if the "Gender" column is dropped earlier in the notebook).
Clus = df[["Annual Income (k$)", "Spending Score (1-100)"]].to_numpy()
# Preview the first rows instead of dumping the whole array into the output.
print(Clus[:5])

In [None]:
# First we need to decide the number of clusters, so we will use the elbow curve.
# By looking at the elbow curve and the scatter plot, we can decide
# accurately how many clusters we need.

In [None]:
# WCSS = within-cluster sum of squares (exposed by KMeans as `inertia_`).
# It is computed for k = 1..10 because the elbow curve built from these
# values is used to choose the number of clusters.
wcss = []

for i in range(1, 11):
    # n_init is pinned explicitly: its default changed from 10 to 'auto' in
    # scikit-learn 1.4 (and leaving it out emits a FutureWarning on 1.2/1.3),
    # so an implicit value makes the curve depend on the installed version.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(Clus)

    wcss.append(kmeans.inertia_)

print(wcss)

In [None]:
# Elbow curve: WCSS plotted against the number of clusters tried (1 to 10).
fig, elbow_ax = plt.subplots(figsize=(8, 4))
elbow_ax.plot(range(1, 11), wcss)
elbow_ax.set_title('The Elbow Point Graph')
elbow_ax.set_xlabel("Number of Clusters")
elbow_ax.set_ylabel("WCSS")
plt.show()

In [None]:
# The optimum number of clusters is 6.
# We choose the number of clusters after which the decrease in WCSS becomes small or almost constant.

In [None]:
# Train k-means with 6 clusters.
# n_init is set explicitly because its default differs between scikit-learn
# versions (10 before 1.4, 'auto' from 1.4 on); pinning it keeps the result
# independent of the installed version.
kmeans = KMeans(n_clusters=6, init='k-means++', n_init=10, random_state=42)

# fit the model and return the cluster label of each data point
y_kmeans = kmeans.fit_predict(Clus)

print(y_kmeans)

In [None]:
# Distinct cluster labels present in the prediction, i.e. the clusters in use.
cluster_ids = np.unique(y_kmeans)
print(cluster_ids)

In [None]:
# Centroid coordinates of the currently fitted KMeans model (one row per cluster).
# NOTE(review): this variable is not refreshed automatically when `kmeans` is refitted later.
cluster_centers=kmeans.cluster_centers_
cluster_centers

In [None]:
# Scatter plot of the clustering result, one colour per cluster.
# The clusters are drawn in a loop over the labels actually present in
# `y_kmeans`, so none is silently left out (the hand-written version drew
# only labels 0-4 although the model was fitted with 6 clusters), and the
# legend is shown so the labels are visible.
plt.figure(figsize=(10,8))
palette = ['orange', 'red', 'lime', 'coral', 'blue', 'purple']
for cluster_id in np.unique(y_kmeans):
    in_cluster = y_kmeans == cluster_id
    plt.scatter(Clus[in_cluster, 0], Clus[in_cluster, 1], s=60,
                c=palette[cluster_id % len(palette)],
                label=f'Cluster {cluster_id + 1}')

# plot the centroids
plt.scatter(cluster_centers[:,0],cluster_centers[:,1], s=100, c='cyan', label='Centroids')

plt.title('customer_segments')
plt.xlabel('Annual_Income')
plt.ylabel('Spending_Score')
plt.legend()
plt.show()

In [None]:
# So we can say that only 5 clusters are required.
# Now we have to adjust our code to produce only 5 clusters.

In [None]:
# Train k-means again, this time with 5 clusters.
# n_init is set explicitly because its default differs between scikit-learn
# versions (10 before 1.4, 'auto' from 1.4 on); pinning it keeps the result
# independent of the installed version.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)

# fit the model and return the cluster label of each data point
y_kmeans = kmeans.fit_predict(Clus)

print(y_kmeans)

In [None]:
# Cluster label of every training row, as stored on the fitted model.
labels=kmeans.labels_
print(labels)

In [None]:
# Store the cluster label of each row in a new "Clus_km" column and preview the frame.
df["Clus_km"]=labels
df.head()

In [None]:
# Mean of every numeric feature per cluster.
# numeric_only=True keeps non-numeric columns (e.g. "Gender", presumably text)
# out of the aggregation; pandas >= 2.0 raises an error on them otherwise.
df.groupby("Clus_km").mean(numeric_only=True)

In [None]:
# Distinct cluster labels produced by the refitted model.
distinct_labels = np.unique(labels)
print(distinct_labels)

In [None]:
# Scatter plot of the final clustering, one colour per cluster.
plt.figure(figsize=(10,8))
palette = ['orange', 'red', 'lime', 'coral', 'blue']
for cluster_id in np.unique(labels):
    in_cluster = labels == cluster_id
    plt.scatter(Clus[in_cluster, 0], Clus[in_cluster, 1], s=60,
                c=palette[cluster_id % len(palette)],
                label=f'Cluster {cluster_id + 1}')

# plot the centroids
# Refresh the centroids from the current model: `cluster_centers` was last
# assigned from the earlier 6-cluster fit, so reusing it here would draw
# centroids that do not belong to this clustering.
cluster_centers = kmeans.cluster_centers_
plt.scatter(cluster_centers[:,0],cluster_centers[:,1], s=200, c='black',label="Centroids")

plt.title('customer_segments')
plt.xlabel('Annual_Income')
plt.ylabel('Spending_Score')
# Show the legend so the cluster and centroid labels are visible.
plt.legend()
plt.show()

In [None]:
# Assign a new, unseen data point to one of the fitted clusters.
# The point must follow the column order of `Clus`
# (presumably [annual income, spending score] - confirm against the training matrix).
new_input=[[20.0,75]]
new_output=kmeans.predict(new_input)
print(new_input,new_output)

# Conclusion

In [None]:
# 5 clusters can easily be seen, with distinct separation between the data points of each cluster.
# With the help of this information we can run targeted advertisements for different groups with different needs,
# to increase our sales and profit.
# We can also decide to provide coupons and measure the effect of the coupons on sales in different groups.

#done