**MALL CUSTOMERS CLUSTERING**

<img src= "https://thumbs.gfycat.com/ChillyAshamedIrishredandwhitesetter-max-1mb.gif"  style='width: 500px;'>

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the mall-customers CSV from the relative '../input/...' path into a DataFrame
ds = pd.read_csv('../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')
# Show the first rows as the cell output for a quick sanity check
ds.head()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
ds.shape

In [None]:
# Per-column flag: True if the column contains at least one missing value
ds.isna().any()

**NO MISSING VALUES**

**VISUALIZING DATASET**

In [None]:
# Side-by-side distributions of annual income and age.
# - sns.histplot (with a KDE overlay) replaces the deprecated sns.distplot.
# - histplot's y-axis holds raw counts, so the 'Count' label is now accurate
#   (distplot drew a density, which the label misdescribed).
# - The style is set before the axes are created so both panels pick it up.
sns.set(style = 'whitegrid')
fig, (ax_income, ax_age) = plt.subplots(1, 2, figsize=(14,6))

sns.histplot(data = ds, x = 'Annual Income (k$)', kde = True, ax = ax_income)
ax_income.set_title('Distribution of Annual Income', fontsize = 20)
ax_income.set_xlabel('Range of Annual Income')
ax_income.set_ylabel('Count')

sns.histplot(data = ds, x = 'Age', kde = True, color = 'red', ax = ax_age)
ax_age.set_title('Distribution of Age', fontsize = 20)
ax_age.set_xlabel('Range of Age')
ax_age.set_ylabel('Count')

plt.show()

* Most customers have an annual income between 50 and 75 k$.
* Customers aged 20-40 are the most frequent mall visitors.

In [None]:
# Bar chart: number of customers in each gender category
plt.figure(figsize=(8, 5))
sns.countplot(data=ds, x='Gender')

**THERE ARE MORE FEMALE CUSTOMERS THAN MALE ONES**

In [None]:
# Bar chart: how many customers have each spending-score value.
# The column is passed via explicit x=/data= keywords: newer seaborn releases
# treat the first positional argument as `data`, so passing the Series
# positionally no longer plots it on the x-axis.
plt.figure(figsize=(20,8))
sns.countplot(x = 'Spending Score (1-100)', data = ds)

In [None]:
# Pairwise scatter/distribution grid over the DataFrame's columns, coloured by gender
sns.pairplot(data=ds, hue='Gender')
# Add vertical breathing room between the rows of the grid
plt.subplots_adjust(hspace=0.8)

In [None]:
# Annotated correlation heatmap.
# Only numeric columns are correlated: pandas >= 2.0 no longer drops
# non-numeric columns silently, so calling ds.corr() on a frame that still
# contains the string 'Gender' column raises an error.
plt.figure(figsize=(10,8))
numeric_corr = ds.select_dtypes(include = 'number').corr()
sns.heatmap(numeric_corr, cmap = 'Wistia', annot = True)
plt.title('Heatmap for the Data', fontsize = 20)
plt.show()

**NO SIGNIFICANT CORRELATION**

In [None]:
# Joint distribution of age vs. spending score, split by gender
sns.jointplot(
    data=ds,
    x='Age',
    y='Spending Score (1-100)',
    hue='Gender',
)

In [None]:
# Strip plot: individual spending scores grouped by gender
plt.figure(figsize=(10, 6))
sns.stripplot(
    data=ds,
    x='Gender',
    y='Spending Score (1-100)',
)

# APPLYING CLUSTERING

**1. K-MEANS CLUSTERING**

In [None]:
# Feature matrix for clustering: the columns at positions 3 and 4 as a 2-D
# NumPy array (the later scatter plots label them as annual income and
# spending score). Clustering is unsupervised, so these are input features,
# not a dependent variable.
x = ds.iloc[:, [3, 4]].to_numpy()

In [None]:
#ELBOW METHOD TO FIND OPTIMAL NUMBER OF CLUSTERS
from sklearn.cluster import KMeans

# Fit K-Means for k = 1..10 and record the within-cluster sum of squares
# (inertia) for each k; the "elbow" of the curve suggests the cluster count.
# random_state makes the curve reproducible across runs, and n_init is set
# explicitly because its default differs between scikit-learn versions.
k_values = range(1,11)
wcss = []
for i in k_values:
    kmeans = KMeans(n_clusters = i, init = 'k-means++', n_init = 10, random_state = 42)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)

plt.plot(k_values, wcss, marker = 'o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

NUMBER OF CLUSTERS = 5

In [None]:
#training model
# Fit the final 5-cluster model and get one cluster id per row of x.
# random_state fixes the initialisation so the cluster ids (and therefore the
# colours/labels in the plot that follows) are the same on every run; n_init
# is set explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters = 5, init = 'k-means++', n_init = 10, random_state = 42)
y_pred = kmeans.fit_predict(x)

In [None]:
#visualizing clusters
# One scatter series per K-Means cluster, plus the fitted centroids in black.
# Looping over kmeans.n_clusters replaces the copy-pasted scatter calls and
# keeps the plot correct if the number of clusters is changed.
plt.figure(figsize=(12,8))
for cluster_id in range(kmeans.n_clusters):
    members = x[y_pred == cluster_id]
    plt.scatter(members[:,0], members[:,1], label = f'Cluster-{cluster_id + 1}', s = 100)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:, 1], s = 50, c = 'black' , label = 'centroid')
plt.legend()
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.show()

**2. HIERARCHICAL CLUSTERING (AGGLOMERATIVE)**


In [None]:
# Feature matrix for hierarchical clustering: the columns at positions 3 and 4
# as a 2-D NumPy array (input features, not a dependent variable).
x = ds.iloc[:, [3, 4]].to_numpy()

In [None]:
#getting optimal number of clusters using dendrogram
from scipy.cluster.hierarchy import dendrogram, linkage

# Build the Ward-linkage merge tree over x and draw it as a dendrogram.
plt.figure(figsize = (12,6))
dendo = dendrogram(linkage(x,method = 'ward'))
plt.title('Dendrogram', fontsize = 20)
plt.xlabel('Customers')
plt.ylabel('Euclidean Distance')

plt.show()

**OPTIMAL CLUSTERS = 3 OR 5**

**BUT I'LL BE TAKING 5**

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (bottom-up) clustering into 5 groups with Ward linkage.
# The `affinity` keyword is deprecated in recent scikit-learn releases in
# favour of `metric`. Ward linkage only supports Euclidean distance, so the
# argument is omitted and this call works on both old and new versions.
ac = AgglomerativeClustering(n_clusters = 5, linkage = 'ward')
y_pred = ac.fit_predict(x)

In [None]:
# Scatter plot of the five agglomerative clusters, one series per cluster id
plt.figure(figsize=(12, 8))
for cluster_id in range(5):
    in_cluster = y_pred == cluster_id
    plt.scatter(
        x[in_cluster, 0],
        x[in_cluster, 1],
        label='Cluster-' + str(cluster_id + 1),
        s=100,
    )
plt.legend()
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')