# Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import norm, boxcox
from scipy import stats
from pandas_profiling import ProfileReport
import plotly.express as px
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as sch
import warnings
warnings.simplefilter(action='ignore', category=Warning)


# Loading Dataset

In [None]:
# Load the mall-customers CSV; index_col=0 makes the first CSV column the
# DataFrame index, so it is not part of `dataset.columns` below
# (presumably that column is a customer id -- confirm against the file).
dataset = pd.read_csv('../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv', index_col=0)


# Exploratory Data Analysis


## 1) Using Manual Methods

In [None]:
dataset.head()

In [None]:
dataset.shape

In [None]:
dataset.describe()


In [None]:
dataset.info()

In [None]:
dataset.isnull().values.any()


## Plotting Count for Annual Income

In [None]:
sns.set_style('whitegrid')
plt.figure(figsize=(20, 7))
sns.countplot(x="Annual Income (k$)", data=dataset, palette='husl');


## Finding Correlation among the variables

In [None]:
plt.figure(figsize=(10, 7))
# 'Gender' is a string column; restrict to numeric columns so that corr()
# also works on pandas versions that no longer silently drop non-numeric data.
correlations = dataset.select_dtypes(include='number').corr()
# Boolean mask hiding the upper triangle (diagonal included) so each pair is
# shown once. Built from the shape rather than from the correlation values, so
# a coefficient that happens to be exactly 0 is still masked.
matrix = np.triu(np.ones(correlations.shape, dtype=bool))
# `linewidths` is the parameter name documented by seaborn.heatmap.
sns.heatmap(correlations, annot=True, linewidths=.8, mask=matrix, cmap="rocket");


## Plotting Distribution for each Column in dataset

In [None]:
def distributionPlot(columnName):
    """Draw a distribution plot with a rug for one column of ``dataset``.

    The 'Gender' column is skipped: nothing is drawn for it. Every other
    column gets its own new figure.
    """
    if columnName == 'Gender':
        return
    plt.figure()
    column_values = dataset[columnName]
    sns.distplot(column_values, color="lightcoral", rug=True);


In [None]:
for column in dataset.columns:
    distributionPlot(column)

## Distribution of Males and Females in dataset

In [None]:
values = dataset['Gender'].value_counts()
# value_counts() orders categories by frequency, not alphabetically, so the
# labels must come from its index; a hard-coded ['Male', 'Female'] list can
# attach the wrong name to each slice.
labels = values.index.tolist()

fig, ax = plt.subplots(figsize=(4, 4), dpi=100)
# One offset per slice; assumes exactly two gender categories -- TODO confirm.
explode = (0, 0.06)

patches, texts, autotexts = ax.pie(values, labels=labels, autopct='%1.2f%%', shadow=True,
                                   startangle=90, explode=explode)

plt.setp(texts, color='black')
plt.setp(autotexts, size=12, color='white')
# The second slice's percentage text is drawn in black instead of white.
autotexts[1].set_color('black')
plt.show()


## Plotting Distribution of Males in dataset

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
sdat = dataset.groupby(by='Gender')
sdat.get_group("Male").plot(kind='hist', ax=ax, subplots=True, bins=40);


In [None]:
sns.pairplot(sdat.get_group("Male"));


## Plotting Distribution of Females in dataset

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
sdat = dataset.groupby(by='Gender')
sdat.get_group("Female").plot(kind='hist', ax=ax, subplots=True, bins=40);


In [None]:
sns.pairplot(sdat.get_group("Female"));


## Skewness Correction
Some columns in the dataset are skewed. Here, I correct that skewness by applying a Box-Cox transformation to each of them.

In [None]:
def skewnessCorrector(columnName):
    """Box-Cox transform ``dataset[columnName]`` in place and plot diagnostics.

    For the column, before and after the transform, this prints the mean and
    standard deviation of a fitted normal distribution and shows a figure
    with a distribution plot (normal fit overlaid) next to a probability plot.

    Side effect: the column in the global ``dataset`` is overwritten with its
    Box-Cox transformed values.

    Note: per the SciPy documentation ``boxcox`` requires strictly positive
    input; no check is performed here.
    """
    print('''Before Correcting''')
    (mu, sigma) = norm.fit(dataset[columnName])
    print("Mu before correcting {} : {}, Sigma before correcting {} : {}".format(
        columnName.capitalize(), mu, columnName.capitalize(), sigma))
    plt.figure(figsize=(20, 10))
    plt.subplot(1, 2, 1)
    sns.distplot(dataset[columnName], fit=norm, color="lightcoral");
    plt.title(columnName.capitalize() +
              " Distplot before Skewness Correction", color="black")
    plt.subplot(1, 2, 2)
    stats.probplot(dataset[columnName], plot=plt)
    plt.show()

    # boxcox also returns the fitted lambda, which is not used here.
    dataset[columnName], _ = boxcox(dataset[columnName])

    print('''After Correcting''')
    # Re-fit on the transformed column; otherwise the "after" line would just
    # repeat the pre-transform mu and sigma.
    (mu, sigma) = norm.fit(dataset[columnName])
    print("Mu after correcting {} : {}, Sigma after correcting {} : {}".format(
        columnName.capitalize(), mu, columnName.capitalize(), sigma))
    plt.figure(figsize=(20, 10))
    plt.subplot(1, 2, 1)
    sns.distplot(dataset[columnName], fit=norm, color="orange");
    plt.title(columnName.capitalize() +
              " Distplot After Skewness Correction", color="black")
    plt.subplot(1, 2, 2)
    stats.probplot(dataset[columnName], plot=plt)
    plt.show()


In [None]:
skewColumnList = ['Age',
                  'Annual Income (k$)', 'Spending Score (1-100)']
for columns in skewColumnList:
    skewnessCorrector(columns)


## 2) Using Pandas Profiling

In [None]:
# pip install pandas_profiling

In [None]:
ProfileReport(dataset)

# Training Clustering Models on Dataset

## Function to find the optimal number of clusters using elbow method

In [None]:
def elbowOptimizer(data):
    """Plot the elbow chart (WCSS against cluster count) for ``data``.

    A K-Means model is fitted for every cluster count from 1 to 10 and the
    model's inertia is plotted against that count.
    """
    cluster_counts = range(1, 11)
    wcss = [
        KMeans(n_clusters=k, init='k-means++', random_state=42).fit(data).inertia_
        for k in cluster_counts
    ]
    plt.plot(cluster_counts, wcss)
    plt.title('The Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show();


## Function to find the optimal number of clusters using Dendrograms

In [None]:
def dendoOptimizer(data):
    """Plot a dendrogram of ``data`` built with Ward linkage."""
    # Build the hierarchical linkage first, then hand it to the plotter.
    ward_linkage = sch.linkage(data, method='ward')
    sch.dendrogram(ward_linkage)
    plt.title('Dendrogram')
    plt.xlabel('Customers')
    plt.ylabel('Euclidean distances')
    plt.show()


## Function for Training K-Means Model on Given Data

In [None]:
def kmeansTrainer(numberOfClusters, data):
    """Fit K-Means on ``data`` and return ``(model, labels)``.

    The model uses ``numberOfClusters`` clusters, k-means++ initialisation
    and a fixed ``random_state`` of 42. ``labels`` is the result of
    ``fit_predict`` on ``data``.
    """
    model = KMeans(
        n_clusters=numberOfClusters,
        init='k-means++',
        random_state=42,
    )
    assignments = model.fit_predict(data)
    return model, assignments


## Function for Training Hierarchical Clustering model on given data

In [None]:
def heirarchicalTrainer(noOfClusters, data):
    """Fit agglomerative (Ward) clustering on ``data`` and return ``(model, labels)``.

    ``noOfClusters`` is the number of clusters to form; ``labels`` is the
    result of ``fit_predict`` on ``data``.
    """
    # The distance metric is left at its default (Euclidean, the only metric
    # Ward linkage accepts). Passing it through the `affinity` keyword is
    # deprecated in scikit-learn and fails on releases that removed it.
    hc = AgglomerativeClustering(n_clusters=noOfClusters, linkage='ward')
    hc_labels = hc.fit_predict(data)
    return (hc, hc_labels)


## Function for visualising 2-d Clusters

In [None]:
def clusterVisualiser(data, model, noOfClusters, labels, xlabel, ylabel, model_type):
    """Scatter-plot 2-D clusters, one colour per cluster.

    Parameters:
        data: indexed as ``data[mask, 0]`` and ``data[mask, 1]``, so it must
            support boolean-row / integer-column indexing (e.g. a 2-D NumPy
            array).
        model: fitted model; only ``model.cluster_centers_`` is read, and
            only when ``model_type`` is 'KMeans Clustering'.
        noOfClusters: number of cluster ids (0 .. noOfClusters - 1) to draw.
        labels: per-row cluster ids, compared element-wise with each id.
        xlabel, ylabel: axis labels.
        model_type: text appended to the plot title; also decides whether
            centroids are drawn.
    """
    color = ['red', 'blue', 'green', 'cyan', 'magenta', 'purple']
    for i in range(noOfClusters):
        members = labels == i
        # Wrap around the palette so more than len(color) clusters reuse
        # colours instead of raising IndexError.
        plt.scatter(data[members, 0], data[members, 1], s=100,
                    c=color[i % len(color)], label='Cluster ' + str(i + 1))
    if model_type == 'KMeans Clustering':
        centers = model.cluster_centers_
        plt.scatter(centers[:, 0], centers[:, 1], s=300, c='yellow', label='Centroids')
    plt.title('Clusters of customers using ' + model_type)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()


# Clustering dataset on Annual Income and Spending Score

In [None]:
# Columns are picked by position (2 and 3) and converted to a NumPy array.
# Presumably these are 'Annual Income (k$)' and 'Spending Score (1-100)',
# matching the axis labels used when X1 is plotted -- confirm against the CSV.
# NOTE(review): if the skewness-correction cell was run first, these columns
# hold Box-Cox transformed values rather than the raw ones.
X1 = dataset.iloc[:, [2, 3]].values

## Using the elbow method to find Optimal Clusters for Annual Income and Spending Score 

In [None]:
elbowOptimizer(X1)

## Using the dendrogram to find the optimal number of clusters for Annual Income and Spending Score

In [None]:
dendoOptimizer(X1)

## Training the K-Means model on Annual Income and Spending Score

In [None]:
kmeans, labels = kmeansTrainer(5, X1)

## Training the Hierarchical model on Annual Income and Spending Score

In [None]:
hc, hc_labels = heirarchicalTrainer(5, X1)

## Visualising the clusters for Annual Income and Spending Score (Kmeans Model)

In [None]:
clusterVisualiser(X1, kmeans, 5, labels,
                  'Annual Income (k$)', 'Spending Score (1-100)', 'KMeans Clustering')


## Visualising the clusters for Annual Income and Spending Score (Hierarchical Model)

In [None]:
clusterVisualiser(X1, hc, 5, hc_labels,
                  'Annual Income (k$)', 'Spending Score (1-100)', 'Heirarchical Clustering')


# Clustering the dataset on Age and Spending Score

In [None]:
# Columns are picked by position (1 and 3) and converted to a NumPy array.
# Presumably these are 'Age' and 'Spending Score (1-100)', matching the axis
# labels used when X2 is plotted -- confirm against the CSV.
X2 = dataset.iloc[:, [1, 3]].values


## Using the elbow method to find the optimal number of clusters for Age and Spending Score

In [None]:
elbowOptimizer(X2)

## Using the dendrogram to find the optimal number of clusters for Age and Spending Score

In [None]:
dendoOptimizer(X2)

## Training K-Means Model on Age and Spending Score

In [None]:
kmeans, labels = kmeansTrainer(4, X2)

## Training the Hierarchical model on Age and Spending Score

In [None]:
hc, hc_labels = heirarchicalTrainer(4, X2)

## Visualising the clusters for Age and Spending Score (Kmeans Model)

In [None]:
clusterVisualiser(X2, kmeans, 4, labels,
                  'Age', 'Spending Score (1-100)', 'KMeans Clustering')

## Visualising the clusters for Age and Spending Score (Hierarchical Model)

In [None]:
clusterVisualiser(X2, hc, 4, hc_labels,
                  'Age', 'Spending Score (1-100)', 'Heirarchical Clustering')


# Clustering the dataset on Age and Annual Income 

In [None]:
# Columns are picked by position (1 and 2) and converted to a NumPy array.
# Presumably these are 'Age' and 'Annual Income (k$)', matching the axis
# labels used when X3 is plotted -- confirm against the CSV.
X3 = dataset.iloc[:, [1,2]].values

## Using the elbow method to find the optimal number of clusters for Age and Annual Income

In [None]:
elbowOptimizer(X3)

## Using the dendrogram to find the optimal number of clusters for Age and Annual Income

In [None]:
dendoOptimizer(X3)

## Training K-Means Model on Age and Annual Income 

In [None]:
kmeans, labels = kmeansTrainer(5, X3)

## Training Hierarchical Clustering on Age and Annual Income

In [None]:
hc, hc_labels = heirarchicalTrainer(5, X3)

## Visualising the clusters for Age and Annual Income (KMeans Model)

In [None]:
clusterVisualiser(X3, kmeans, 5, labels,
                  'Age', 'Annual Income', 'KMeans Clustering')

## Visualising the clusters for Age and Annual Income (Hierarchical Model)

In [None]:
clusterVisualiser(X3, hc, 5, hc_labels,
                  'Age', 'Annual Income', 'Heirarchical Clustering')


# Clustering the dataset on Age, Annual Income and Spending Score

In [None]:
# Every column from position 1 onwards (the first column is left out).
# An explicit copy is taken because later cells add label columns to X4;
# that must neither touch `dataset` nor depend on view-vs-copy semantics.
X4 = dataset.iloc[:, 1:].copy()

## Using the elbow method to find the optimal number of clusters for Age, Annual Income and Spending Score

In [None]:
elbowOptimizer(X4)

## Using the dendrogram method to find the optimal number of clusters for Age, Annual Income and Spending Score

In [None]:
dendoOptimizer(X4)

## Training K-Means Model on Age, Annual Income and Spending Score

In [None]:
# Fit on the feature columns only. If this cell is re-run after label columns
# were added to X4, they must not be fed back into the model as features.
kmeans, labels = kmeansTrainer(
    6, X4.drop(columns=['label', 'hc_labels'], errors='ignore'))
X4['label'] = labels


## Training Hierarchical Model on Age, Annual Income and Spending Score

In [None]:
# X4 already carries the K-Means 'label' column at this point (and 'hc_labels'
# on a re-run). Drop those before fitting so the hierarchical model clusters
# on the original features only, not on another model's output.
hc, hc_labels = heirarchicalTrainer(
    6, X4.drop(columns=['label', 'hc_labels'], errors='ignore'))
X4['hc_labels'] = hc_labels

## Visualising the clusters for Age, Annual Income and Spending Score (KMeans Model)

In [None]:
# Colour by cluster label as a string so Plotly treats it as a category
# (discrete colours, one legend entry per cluster) rather than a continuous
# scale. Marker size is no longer tied to the label: that gave cluster 0 a
# size of 0 and made marker size look like a meaningful quantity.
fig = px.scatter_3d(X4, x="Annual Income (k$)", y="Spending Score (1-100)", z="Age",
                    color=X4['label'].astype(str))
fig.show()


## Visualising the clusters for Age, Annual Income and Spending Score (Hierarchical Model)

In [None]:
# Colour by cluster label as a string so Plotly treats it as a category
# (discrete colours, one legend entry per cluster) rather than a continuous
# scale. Marker size is no longer tied to the label: that gave cluster 0 a
# size of 0 and made marker size look like a meaningful quantity.
fig = px.scatter_3d(X4, x="Annual Income (k$)", y="Spending Score (1-100)", z="Age",
                    color=X4['hc_labels'].astype(str))
fig.show()


# **Please Give Feedback by Commenting below and if you like my work please Consider Upvoting.** 