![](http://frankdatascience.files.wordpress.com/2018/09/1_2bpc6k2c4ojhp00ijxbska.jpeg?w=1023)

In [1]:
import pandas as pd
import numpy as np

# Read the mall customer table from the Kaggle input directory and preview the first rows
csv_path = "../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
df = pd.read_csv(csv_path)
df.head()

In [1]:
# (number of rows, number of columns) of the loaded customer table
df.shape

In [1]:
# Number of missing entries in each column
df.isna().sum()

In [1]:
# How many customers fall under each Gender value
df["Gender"].value_counts()

In [1]:
# Encode Gender as integers (Female -> 0, Male -> 1).
# The guard makes the cell safe to re-run: mapping an already-encoded column
# a second time would find no matching keys and turn every value into NaN.
gender_codes = {'Female': 0, 'Male': 1}
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].map(gender_codes)

In [1]:
#understanding relationship between age and income visually
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

# Scatter of age against annual income; fit_reg=False disables the regression line.
# x/y are passed by keyword and the figure size via `height` (the older
# positional form and the `size` argument are not accepted by current seaborn).
sn.lmplot(x="Age", y="Annual Income (k$)", data=df, fit_reg=False, height=5)

Highest income is around age 30-35

In [1]:
#scatterplot between age and spending score
# Keyword x/y and `height` replace the positional arguments and the removed `size` option.
sn.lmplot(x="Age", y="Spending Score (1-100)", data=df, fit_reg=False, height=5)

The lower the age, the higher the spending score tends to be

In [1]:
# Annual income for each value of the Gender column, drawn as a scatter (no regression line).
# Keyword x/y and `height` replace the positional arguments and the removed `size` option.
sn.lmplot(x="Gender", y="Annual Income (k$)", data=df, fit_reg=False, height=5)

Male annual income is slightly higher than Female annual income

In [1]:
# Scatter of annual income against spending score (no regression line).
# Keyword x/y and `height` replace the positional arguments and the removed `size` option.
sn.lmplot(x="Annual Income (k$)", y="Spending Score (1-100)", data=df, fit_reg=False, height=5)

Customers with an annual income of $40-60k mostly have a spending score between 40 and 60

In [1]:
# Spending score for each value of the Gender column, drawn as a scatter (no regression line).
# Keyword x/y and `height` replace the positional arguments and the removed `size` option.
sn.lmplot(x="Gender", y="Spending Score (1-100)", data=df, fit_reg=False, height=5)

The spending scores of females are slightly higher than those of males

In [1]:
from sklearn.cluster import KMeans
# Baseline k-means (k=3) on the raw, unscaled customer attributes.
# Only the three behavioural columns are used: fitting on the whole frame would
# also feed in the CustomerID identifier and the encoded Gender (and, when the
# cell is re-run, any cluster-id columns added further down).
# random_state is fixed so the cluster labels are reproducible between runs.
clusters = KMeans(n_clusters=3, random_state=42)
clusters.fit(df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']])

In [1]:
# Store the label that the fitted `clusters` model assigned to each row as a new column
df["cluster_id"] = clusters.labels_

In [1]:
# Age vs. annual income, coloured by `cluster_id` with one marker shape per cluster
markers = ['+', '^', '.']
sn.lmplot(
    data=df,
    x="Age",
    y="Annual Income (k$)",
    hue="cluster_id",
    markers=markers,
    fit_reg=False,
    height=6,
)

### Normalizing features

In [1]:
from sklearn.preprocessing import StandardScaler
# Standardize the three numeric features; the result is stored in `scaled_df`
scaler = StandardScaler()
feature_frame = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]
scaled_df = scaler.fit_transform(feature_frame)

In [1]:
from sklearn.cluster import KMeans
# k-means with k=3 on the standardized features.
# random_state is fixed (matching the final model further down) so the labels
# stored in `cluster_id_new` do not change from one run to the next.
clusters_new = KMeans(n_clusters=3, random_state=42)
clusters_new.fit(scaled_df)
df['cluster_id_new'] = clusters_new.labels_

In [1]:
# Age vs. annual income, coloured by `cluster_id_new` with one marker shape per cluster
markers = ['+', '^', '.']
sn.lmplot(
    data=df,
    x="Age",
    y="Annual Income (k$)",
    hue="cluster_id_new",
    markers=markers,
    fit_reg=False,
    height=6,
)

A significant change! The cluster assignments obtained after normalizing differ from the ones obtained before normalizing

In [1]:
# Centroids of the fitted clusters. The model was fitted on the StandardScaler output,
# so the values are in standardized units (column order: Age, Annual Income, Spending Score),
# not in the original units of the data.
clusters_new.cluster_centers_

In [1]:
#Cluster centres and their interpretation
# Per-cluster mean and standard deviation of each feature, computed on the unscaled columns.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names is deprecated and raises an error in current pandas.
feature_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
df.groupby('cluster_id_new')[feature_cols].agg(['mean', 'std']).reset_index()

## Finding optimal number of clusters using Elbow method

In [1]:
# Fit k-means for k = 1..9 on the standardized features and record each model's inertia.
cluster_range = range(1, 10)
cluster_errors = []

for num_clusters in cluster_range:
    # A dedicated name keeps the loop from overwriting the `clusters` model used by
    # other cells; the fixed seed makes the curve reproducible.
    elbow_model = KMeans(n_clusters=num_clusters, random_state=42)
    elbow_model.fit(scaled_df)
    # inertia_ is the within-cluster sum of squared distances to the nearest centre
    cluster_errors.append(elbow_model.inertia_)

plt.figure(figsize=(10, 8))
plt.plot(cluster_range, cluster_errors, marker="o")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia (within-cluster sum of squares)")
plt.title("Elbow curve");

The elbow point is at 4, which indicates there might be 4 clusters in the dataset

In [1]:
#creating clusters
# Final k-means segmentation on the standardized features with k clusters.
k = 4
clusters = KMeans(n_clusters=k, random_state=42)
clusters.fit(scaled_df)
df['scaled_cluster_id'] = clusters.labels_
#remove the previous cluster ids
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(['cluster_id', 'cluster_id_new'], axis=1, errors='ignore')

In [1]:
# Rows assigned to cluster 0
df.loc[df["scaled_cluster_id"] == 0]

In [1]:
# Rows assigned to cluster 1
df.loc[df["scaled_cluster_id"] == 1]

In [1]:
# Rows assigned to cluster 2
df.loc[df["scaled_cluster_id"] == 2]

In [1]:
# Rows assigned to cluster 3
df.loc[df["scaled_cluster_id"] == 3]

In [1]:
# Age vs. annual income, coloured by `scaled_cluster_id` with one marker shape per cluster
markers = ['+', '^', '.', '*']
sn.lmplot(
    data=df,
    x="Age",
    y="Annual Income (k$)",
    hue="scaled_cluster_id",
    markers=markers,
    fit_reg=False,
    height=6,
)

## Hierarchical clustering

Algorithm:
### 1) Start with each data point in a single cluster
### 2) Find the two closest clusters (initially single data points) and merge them to form one cluster
### 3) Repeat step 2 until data points are merged together to form a single cluster

In [1]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into four groups on the standardized features;
# the resulting labels are stored in the `h_clusterid` column
h_cluster = AgglomerativeClustering(n_clusters=4).fit(scaled_df)
df["h_clusterid"] = h_cluster.labels_

In [1]:
# Age vs. annual income, coloured by `h_clusterid` with one marker shape per cluster
markers = ['+', '^', '.', '*']
sn.lmplot(
    data=df,
    x="Age",
    y="Annual Income (k$)",
    hue="h_clusterid",
    markers=markers,
    fit_reg=False,
    height=6,
)

Upvote please if you liked it :)