## **Importing some useful libraries**

In [None]:
#Importing important libraries

import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import pandas_profiling as pp

## Importing dataset 

In [None]:
# Read the mall-customers CSV from the Kaggle input directory into a DataFrame.
data_path = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
dataset = pd.read_csv(data_path)

In [None]:
# Descriptive statistics of the dataset; the bare last expression renders the table.
summary_stats = dataset.describe()
summary_stats

## Detailed EDA of the dataset using pandas profiling

In [None]:
# Build the pandas-profiling report; leaving it as the last expression displays it.
profile_report = pp.ProfileReport(dataset)
profile_report

### *As seen from the correlation chart, there is no true correlation between any of the variables.

In [None]:
# Drop CustomerID because it is useless for the clustering analysis.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: executing it a second time no longer raises
# KeyError for the already-removed column.
dataset = dataset.drop(columns='CustomerID', errors='ignore')

In [None]:
# Pairwise relationship plots for the dataset, with the figure enlarged to 15x15 inches.
pair_grid = sns.pairplot(dataset)
pair_grid.fig.set_size_inches(15, 15)

### *Annual income higher than 100k is seen only for people in the 30-50 age range.
### *Another interesting pattern is the cluster formation among the datapoints in the Annual Income vs. Spending Score chart: there are almost 5 clusters.

In [None]:
# Convert the categorical Gender column to numeric codes: Male -> 1, Female -> 0.
gender_codes = {'Male': 1, 'Female': 0}
for gender_name, gender_code in gender_codes.items():
    dataset.loc[dataset.Gender == gender_name, 'Gender'] = gender_code

# The column still holds Python objects after the assignments; cast it to int.
dataset.Gender = dataset.Gender.astype(int)

## Scaling: in general, features on different scales can affect cluster formation because they distort the distances between points. In this specific dataset, even an unscaled feature set works (I have tried).

In [None]:

from sklearn.preprocessing import StandardScaler

# Standardise every column except Gender. StandardScaler centres and scales each
# column independently, so a single call over the selected columns gives the same
# values as scaling them one at a time.
scale_cols = [col for col in dataset.columns.values.tolist() if col != 'Gender']
dataset[scale_cols] = StandardScaler().fit_transform(dataset[scale_cols])

## Cluster Number Selection Based on Inertia Method

In [None]:
# Fit k-means for every cluster count from 1 to 14 and record, per run,
# the cluster count, the resulting inertia and the fitted model.
km_list = []

for clust in range(1, 15):
    km = KMeans(n_clusters=clust, init='k-means++', random_state=42).fit(dataset)
    km_record = {'clusters': clust,
                 'inertia': km.inertia_,
                 'model': km}
    km_list.append(pd.Series(km_record))

In [None]:
# Plot how inertia changes as the number of clusters grows.
# Each Series in km_list becomes one row; only the two plotted fields are kept.
inertia_table = pd.concat(km_list, axis=1).T
plot_data = inertia_table[['clusters', 'inertia']].set_index('clusters')

ax = plot_data.plot(marker='o', ls='-')
ax.set_xticks(range(0, 15, 1))
ax.set_xlim(0, 16)
# Dashed horizontal reference line at inertia = 190.
ax.axhline(y=190, c='black', ls='--')
ax.set(xlabel='Cluster', ylabel='Inertia');


### *The dotted line shows that after 5 clusters are formed, there is no large decrease in the inertia value. We can choose either 4 or 5 according to our threshold inertia value. I took 5, as it is evident from the earlier graph of Spending Score and Income. Note: this applies if we want to cluster based on those specific features.

## Selecting the cluster number based on Hierarchical Clustering Distance Threshold 

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward-linkage agglomerative model. n_clusters is set to the base value 2 and
# compute_full_tree=True so that the whole merge tree is built.
ag = AgglomerativeClustering(
    n_clusters=2,
    linkage='ward',
    compute_full_tree=True,
).fit(dataset)

In [None]:
from scipy.cluster import hierarchy

# Build the linkage matrix from the feature matrix itself. Passing ag.children_
# to linkage() would treat the (child_a, child_b) merge-index pairs as if they
# were observations, so the heights would not be distances between customers.
# Label columns are excluded by name in case this cell is re-run after they exist.
linkage_features = dataset.drop(columns=['km', 'agglom'], errors='ignore')
Z = hierarchy.linkage(linkage_features, method='ward')

fig, ax = plt.subplots(figsize=(15,5))

# Height of the cut that leaves n_clusters_cut clusters: halfway between the
# merge that reduces n_clusters_cut + 1 clusters to n_clusters_cut (row
# -n_clusters_cut of Z) and the next merge above it. The value 5 is an analyst
# choice; it is used to draw the reference line, not computed from the data.
n_clusters_cut = 5
threshold = (Z[-n_clusters_cut, 2] + Z[-(n_clusters_cut - 1), 2]) / 2

# Some color setup
red = 'red'
blue = 'blue'

hierarchy.set_link_color_palette([red, 'green'])

# color_threshold ties the link colouring to the same height as the dashed line,
# so "above threshold" in the plot means above the drawn cut.
den = hierarchy.dendrogram(Z, orientation='top',
                           p=30, truncate_mode='lastp',
                           show_leaf_counts=True, ax=ax,
                           color_threshold=threshold,
                           above_threshold_color=blue)
ax.axhline(y=threshold, c='black',ls='--')
ax.set_ylabel('Distance')
ax.set_xlabel('datapoint_Count')
plt.show()

### *From the dendrogram, it is evident from the dotted line that the distance doesn't decrease drastically if we increase the number of clusters even further. Both of these analyses point to taking the number of clusters as 5.

## 1.Clustering with the entire dataset taken.

In [None]:
#Kmeans with 5 clusters
# Select the feature columns by name so that re-running this cell (or running it
# after the agglomerative cell) never feeds earlier cluster-label columns back
# into the model as features.
feature_cols = [col for col in dataset.columns if col not in ('km', 'agglom')]
km = KMeans(n_clusters=5,init='k-means++', random_state=36)
# fit_predict fits the model and returns the labels in one pass, so a separate
# fit() call beforehand would only repeat the same work.
dataset['km'] = km.fit_predict(dataset[feature_cols])

In [None]:
#AgglomerativeClustering with ward linkage
# Cluster on the feature columns only. Calling fit_predict on the whole frame
# here would include the 'km' label column as an extra feature and bias the
# agglomerative labels toward the k-means result.
feature_cols = [col for col in dataset.columns if col not in ('km', 'agglom')]
ag = AgglomerativeClustering(n_clusters=5, linkage='ward', compute_full_tree=True)
# One fit_predict call both fits the model and returns the labels.
dataset['agglom'] = ag.fit_predict(dataset[feature_cols])

In [None]:
#Plotting the clusters predicted.
color = 'brgym'
alpha = 0.5
labels=['Cluster1','Cluster2','Cluster3','Cluster4','Cluster5']
for i in range(5):
    in_cluster = km.labels_==i
    # Members of cluster i: annual income on x, spending score on y.
    plt.scatter(dataset['Annual Income (k$)'][in_cluster],dataset['Spending Score (1-100)'][in_cluster],c = color[i],alpha = alpha,s=20)
    # Centroid of cluster i. Columns 2 and 3 of cluster_centers_ are taken as
    # income and spending score -- assumes the fitted feature order was
    # Gender, Age, Annual Income, Spending Score; TODO confirm.
    plt.scatter(km.cluster_centers_[i,2],km.cluster_centers_[i,3],c = color[i], marker = 'X', s = 200,label=labels[i])
# Axis labels match the plotted data: income is x, spending score is y.
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.legend()

### *It is evident that the model could not entirely separate the clusters on these two features. However, this might also be because it gave other features (other than Annual Income or Spending Score) more importance.
### *Clusters 5 and 3 are wrongly labelled or interchanged.

In [None]:
#Using agglomerative hierarchical clustering labels
color = 'brgcm'
alpha = 0.5
for i in range(5):
    in_cluster = ag.labels_==i
    # Members of cluster i: annual income on x, spending score on y.
    plt.scatter(dataset['Annual Income (k$)'][in_cluster],dataset['Spending Score (1-100)'][in_cluster],c = color[i],alpha = alpha,s=20)
# Axis labels match the plotted data: income is x, spending score is y.
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')

### *Almost exactly the same result using agglomerative clustering too.
### *We can either use this or give the two selected variables more importance by clustering on just these two variables.

In [None]:
color = 'brgym'
alpha = 0.5
for i in range(5):
    plt.scatter(dataset['Annual Income (k$)'][ag.labels_==i],dataset['Age'][ag.labels_==i],c = color[i],alpha = alpha,s=20)
plt.ylabel('Annual Income')
plt.xlabel('Age')
plt.legend()

### The above graph cements our earlier insight that other features might have had more importance in our clustering, as the model almost successfully clustered the datapoints in the above chart.

## 2.Clustering with just the two features taken

In [None]:
#Taking just 'Annual Income (k$)', 'Spending Score (1-100)' for clustering
# .copy() gives dataset_small its own data, so adding the 'km' column below does
# not raise SettingWithCopyWarning or depend on whether the selection is a view.
dataset_small = dataset[['Annual Income (k$)', 'Spending Score (1-100)']].copy()
km = KMeans(n_clusters=5,init='k-means++', random_state=42)
# fit_predict fits the model and returns the labels in one pass; the right-hand
# side is evaluated before 'km' is added, so only the two features are used.
dataset_small['km'] = km.fit_predict(dataset_small)


In [None]:
#Plotting
color = 'brgcm'
alpha = 0.5
for i in range(5):
    in_cluster = km.labels_==i
    # Members of cluster i: annual income on x, spending score on y.
    plt.scatter(dataset_small['Annual Income (k$)'][in_cluster],dataset_small['Spending Score (1-100)'][in_cluster],c = color[i],alpha = alpha,s=20)
    # Centroid of cluster i; the model was fitted on (income, spending score),
    # so centroid columns 0 and 1 are used.
    plt.scatter(km.cluster_centers_[i,0],km.cluster_centers_[i,1],c = color[i], marker = 'X', s = 200,label=labels[i])
# Axis labels match the plotted data: income is x, spending score is y.
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.legend()

### *As expected, the model correctly formed 5 clusters without a large number of outliers.

In [None]:
#Checking the new model in the old dataset with different features
for i in range(5):
    in_cluster = km.labels_==i
    # Members of cluster i: annual income on x, age on y.
    plt.scatter(dataset['Annual Income (k$)'][in_cluster],dataset['Age'][in_cluster],c = color[i],alpha = alpha,s=20,label=labels[i])
# Axis labels match the plotted data: income is x, age is y.
plt.xlabel('Annual Income')
plt.ylabel('Age')
plt.legend()

In [None]:
# For this specific dataset, clustering on just the two selected variables is a
# more apt solution than clustering on the whole dataset, so the predicted-label
# column of the full dataset is replaced with the labels of the two-feature model.
two_feature_labels = dataset_small['km']
dataset['km'] = two_feature_labels

## PCA and Variance Explanation Analysis

In [None]:
#Checking if we can reduce the number of features by using PCA
# Remove the cluster-label columns by name rather than by position: a positional
# slice of "everything but the last two columns" silently drops real features
# whenever a label column is missing or the column order changes.
dataset_pca = dataset.drop(columns=['km', 'agglom'], errors='ignore')

In [None]:
from sklearn.decomposition import PCA

feature_weight = []
variance_explained = []

# Try every possible number of components, from 1 up to the number of features.
# range() excludes its upper bound, hence the + 1.
component_counts = range(1, dataset_pca.shape[1] + 1)

for n in component_counts:

    PCAmod = PCA(n_components=n)
    PCAmod.fit(dataset_pca)

    # Store the total fraction of variance explained by the first n components
    variance_explained.append(PCAmod.explained_variance_ratio_.sum())

    # Feature weights: absolute loadings summed over the n components, then
    # normalised so the weights of one run sum to 1.
    abs_feature_values = np.abs(PCAmod.components_).sum(axis=0)
    feature_weight.append(pd.DataFrame({'n':n,
                                             'features': dataset_pca.columns,
                                             'values':abs_feature_values/abs_feature_values.sum()}))

# Index the table by the number of components (1..N) so that the table and any
# plot built from it show the real dimension count instead of a 0-based position.
var = pd.DataFrame({'explained_variance': variance_explained},
                   index=pd.Index(component_counts, name='n_components'))
var

In [None]:
# Stack the per-run weight tables, then reshape them so that each row is one
# component count ('n') and each column is one feature.
stacked_weights = pd.concat(feature_weight)
features_df = stacked_weights.pivot(index='n', columns='features', values='values')

features_df

### *From the above dataframe, it is clear that Gender as a feature has the least importance in all our PCA lists. Age and Gender form the most important duo, followed closely by Annual Income.

In [None]:
# Bar chart of the explained variance stored in `var`.
axis_text = {
    'xlabel': 'Number of dimensions',
    'ylabel': 'Percent explained variance',
    'title': 'Explained Variance vs Dimensions',
}
ax = var.plot(kind='bar')

ax.set(**axis_text)

### *So with just 2 dimensions, we can capture almost 72% of the dataset variance. If it is increased to 3, more than 92 percent can be captured, so we can drop one or two features from this dataset without losing much information.