In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)
        
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.mixture import GaussianMixture
from sklearn import metrics
from sklearn.cluster import KMeans

# Loading Data

In [None]:
# Read the customer CSV (path is relative to the notebook's working directory)
# into the DataFrame used by every later cell.
file_path = (
    '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
)
mail_customers = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first five rows of the loaded data.
mail_customers.head(n=5)

In [None]:
# For each column, report True if it contains at least one missing value.
missing_mask = mail_customers.isna()
missing_mask.any()

> There are no missing values in any column, so there is no need to handle NaN values.

In [None]:
# Summary statistics for the numeric columns of the dataset.
summary_stats = mail_customers.describe()
summary_stats

# Data Visualization

## Distribution of gender

In [None]:
# Share of male and female customers, shown as a pie chart.
# The percentages are kept in `gender_percentages` rather than `list`, so the
# built-in `list` type is not shadowed for the rest of the kernel session.
gender_counts = mail_customers['Gender'].value_counts()
total_customers = len(mail_customers)
male_percentage = round(gender_counts.get('Male', 0) / total_customers * 100, 2)
female_percentage = round(gender_counts.get('Female', 0) / total_customers * 100, 2)
gender_percentages = [male_percentage, female_percentage]

plt.figure(figsize=(6, 6))
plt.pie(
    gender_percentages,
    labels=['Male', 'Female'],
    autopct='%2.1f%%',
    shadow=True,
    explode=[0.05, 0.05],
)
plt.title('Customer gender ratio')
plt.legend(loc="upper right");

> From the pie chart, we can see that a majority of customers (56%) are female, and 44% of customers are male.

## The distribution of Age

In [None]:
# Histogram of customer ages with a KDE overlay.
# `sns.distplot` is deprecated and plots a density, which made the 'Count'
# y-label inaccurate; `histplot` draws real counts so the label is correct.
plt.figure(figsize=(10, 6))
sns.histplot(data=mail_customers, x='Age', kde=True, color='red')
plt.title('Distribution of Age', fontsize=15)
plt.xlabel('Range of Age')
plt.ylabel('Count');

> From the plot, we can see that the ages of the majority of customers lie between 30 and 40.

## Separating age into age group

In [None]:
# Bucket customers into age groups and store the result in an 'Age group' column.
# The outer edges are open-ended (+/- infinity) instead of min-1 / max+1:
# pd.cut requires strictly increasing edges, which the data-derived edges
# violate whenever the youngest customer is over 20 or the oldest is under 70.
# Intervals are right-closed (pd.cut default), e.g. '20-30' covers 20 < age <= 30.
bins = [-np.inf, 20, 30, 40, 50, 60, 70, np.inf]
labels = ['below 20','20-30','30-40','40-50','50-60','60-70','above 70']
mail_customers['Age group'] = pd.cut(mail_customers['Age'], bins=bins, labels=labels)

In [None]:
# Bar chart of the number of customers that fall in each age group.
sns.set(style="whitegrid")
aggResult1 = mail_customers.groupby(by=['Age group'])['Age group'].count()

plt.figure(figsize=(10, 6))
age_group_axes = sns.barplot(x=aggResult1.index, y=aggResult1)
age_group_axes.set_ylabel('Number of People')


## Distribution of Annual Income

In [None]:
# Histogram of annual income with a KDE overlay.
# `sns.distplot` is deprecated and plots a density, which made the 'Count'
# y-label inaccurate; `histplot` draws real counts so the label is correct.
plt.figure(figsize=(10, 6))
sns.histplot(data=mail_customers, x='Annual Income (k$)', kde=True)
plt.title('Distribution of Annual Income', fontsize=15)
plt.xlabel('Range of Annual Income')
plt.ylabel('Count');

## Distribution of Annual Income grouped by gender

In [None]:

# Box plot of annual income per age group, split by gender.
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=mail_customers,
    x='Age group',
    y='Annual Income (k$)',
    hue='Gender',
    palette='pastel',
)


> From the plot above, it is clear that the 20-30 and 30-40 age groups have relatively higher annual incomes than the other groups. Also, the income of males is generally higher than that of females, except in the 50-60 age group.

## Relationship between each pair of attributes

In [None]:
# Pairwise scatter/density plots for the columns at positions 1-4, coloured by
# gender (presumably Gender, Age, income and spending score - verify against the CSV).
pairplot_frame = mail_customers.iloc[:, 1:5]
sns.pairplot(pairplot_frame, hue='Gender')

 > The plot above shows the distribution of each pair of attributes, separated by gender. We can see that there is no obvious pattern in any of the plots, so I segment customers using all of these attributes.

## Heatmap of attributes

In [None]:
# Select the three numeric features used for clustering and show their
# pairwise correlation coefficients as an annotated heatmap.
column = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
data = mail_customers.loc[0:, column]

correlation_matrix = data.corr()
plt.figure(figsize=(14, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='viridis')

> From the heatmap above, we can see that the correlation coefficients between each pair of attributes are very small. So again, we can apply a clustering method using all of these attributes.

# Clustering using Kmeans
> To evaluate the quality of the clustering results, I use the Silhouette Score. When the silhouette value $s_i$ of sample $i$ is close to 1, the sample is well matched to its cluster. If $s_i$ is close to -1, sample $i$ should probably have been assigned to another cluster. If $s_i$ is approximately 0, sample $i$ lies on the boundary between two clusters. A larger overall Silhouette Score (closer to 1) means a better clustering result.

 ## 1.Segmentation using Age, Annual Income and Spending Score (1-100)

In [None]:
# Silhouette score of K-Means on all three features for k = 2..10.
silhouette_all = []
for k in range(2, 11):
    cluster_ids = KMeans(n_clusters=k, random_state=1).fit(data).labels_
    score = metrics.silhouette_score(data, cluster_ids, metric='euclidean')
    silhouette_all.append(score)
    print('This is the silhouette score when k equals', k, ': ', score)

In [None]:
# Silhouette score versus number of clusters (all three features).
# The annotated maximum is derived from `silhouette_all` rather than a score
# hard-coded from one past run, so it stays correct if the scores change.
k_values = range(2, 11)
best_k, best_score = max(zip(k_values, silhouette_all), key=lambda pair: pair[1])

plt.figure(figsize=(10, 6))
plt.plot(k_values, silhouette_all, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('silhouette score')
# Offset the text from the point so the arrow has a visible length.
plt.annotate(
    'max score',
    xy=(best_k, best_score),
    xytext=(20, -40),
    textcoords='offset points',
    arrowprops=dict(facecolor='black'),
);

In [None]:
# Fit the final 6-cluster K-Means model on all three features and show how
# many customers land in each cluster.
km = KMeans(n_clusters=6, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_means = km.fit_predict(data)

plt.figure(figsize=(10, 6))
# Pass the labels as `x=`: recent seaborn versions treat the first positional
# argument of countplot as `data`, not as the variable to count.
sns.countplot(x=y_means)
plt.xlabel('clusters')
plt.ylabel('counts in each cluster');

In [None]:
# 3-D scatter of the six clusters: Age (x), Annual Income (y), Spending Score (z).
# The points are drawn with the 3-D axes' own `scatter`. `plt.scatter(x, y, z)`
# treats its third positional argument as the marker size, so the original
# drew every point at z=0 with spending score as the size, not the z coordinate.
x = data.values
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

cluster_colors = ['green', 'yellow', 'cyan', 'magenta', 'orange', 'red']
for cluster_id, color in enumerate(cluster_colors):
    members = x[y_means == cluster_id]
    ax.scatter(members[:, 0], members[:, 1], members[:, 2], c=color)

centers = km.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], c='blue', label='centeroid')

ax.set_xlabel('Age of a customer')
ax.set_ylabel('Anual Income')
ax.set_zlabel('Spending Score')
ax.set_title('Clusters of Customers')
# Show the legend so the centroid label is actually visible.
ax.legend();


 ## 2.Segmentation using Age and Spending Score (1-100)

In [None]:
# Silhouette score of K-Means on Age and Spending Score for k = 2..10.
column2 = ['Age','Spending Score (1-100)']
data2 = mail_customers.loc[0:,column2]
silhouette_all2 = []
for k in range(2, 11):
    # Fit on the same two-feature subset that is scored. Fitting on the
    # three-feature `data` produced labels from a different feature space,
    # so the silhouette score did not describe this segmentation.
    kmeans_model = KMeans(n_clusters=k, random_state=1).fit(data2)
    labels = kmeans_model.labels_
    a = metrics.silhouette_score(data2, labels, metric='euclidean')
    silhouette_all2.append(a)
    print('This is the silhouette score when k equals',k,': ',a)

In [None]:
# Silhouette score versus number of clusters (Age and Spending Score).
# The annotated maximum is derived from `silhouette_all2` rather than a score
# hard-coded from one past run, so it stays correct if the scores change.
k_values = range(2, 11)
best_k, best_score = max(zip(k_values, silhouette_all2), key=lambda pair: pair[1])

plt.figure(figsize=(10, 6))
plt.plot(k_values, silhouette_all2, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('silhouette score')
# Offset the text from the point so the arrow has a visible length.
plt.annotate(
    'max score',
    xy=(best_k, best_score),
    xytext=(20, -40),
    textcoords='offset points',
    arrowprops=dict(facecolor='black'),
);

In [None]:
# Fit a 2-cluster K-Means model on Age and Spending Score and show how many
# customers land in each cluster.
# NOTE(review): k=2 is hard-coded - confirm it still matches the best
# silhouette score whenever the silhouette analysis is re-run.
km = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_means = km.fit_predict(data2)

plt.figure(figsize=(10, 6))
# Pass the labels as `x=`: recent seaborn versions treat the first positional
# argument of countplot as `data`, not as the variable to count.
sns.countplot(x=y_means)
plt.xlabel('clusters')
plt.ylabel('counts in each cluster');

In [None]:
# 2-D scatter of the two Age / Spending Score clusters with their centroids.
x = data2.values
fig = plt.figure(figsize=(16, 10))

# One (colour, marker) pair per cluster id, drawn in cluster-id order.
cluster_styles = [('green', 'o'), ('yellow', 'v')]
for cluster_id, (colour, shape) in enumerate(cluster_styles):
    in_cluster = y_means == cluster_id
    plt.scatter(x[in_cluster, 0], x[in_cluster, 1], s=50, c=colour, marker=shape)

centres = km.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1], s=100, c='red', label='centeroid')

plt.xlabel('Age of a customer')
plt.ylabel('Spending Score (1-100)')
plt.title('Clusters of Customers')

 ## 3.Segmentation using Annual Income and Spending Score (1-100)

In [None]:
# Silhouette score of K-Means on Annual Income and Spending Score for k = 2..10.
column3 = ['Annual Income (k$)','Spending Score (1-100)']
data3 = mail_customers.loc[0:,column3]
silhouette_all3 = []
for k in range(2, 11):
    # Fit on the same two-feature subset that is scored. Fitting on the
    # three-feature `data` produced labels from a different feature space,
    # so the silhouette score did not describe this segmentation.
    kmeans_model = KMeans(n_clusters=k, random_state=1).fit(data3)
    labels = kmeans_model.labels_
    a = metrics.silhouette_score(data3, labels, metric='euclidean')
    silhouette_all3.append(a)
    print('This is the silhouette score when k equals',k,': ',a)

In [None]:
# Silhouette score versus number of clusters (Annual Income and Spending Score).
# The annotated maximum is derived from `silhouette_all3` rather than a score
# hard-coded from one past run, so it stays correct if the scores change.
k_values = range(2, 11)
best_k, best_score = max(zip(k_values, silhouette_all3), key=lambda pair: pair[1])

plt.figure(figsize=(10, 6))
plt.plot(k_values, silhouette_all3, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('silhouette score')
# Offset the text from the point so the arrow has a visible length.
plt.annotate(
    'max score',
    xy=(best_k, best_score),
    xytext=(20, -40),
    textcoords='offset points',
    arrowprops=dict(facecolor='black'),
);

In [None]:
# Fit a 5-cluster K-Means model on Annual Income and Spending Score and show
# how many customers land in each cluster.
# NOTE(review): k=5 is hard-coded - confirm it still matches the best
# silhouette score whenever the silhouette analysis is re-run.
km = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_means = km.fit_predict(data3)

plt.figure(figsize=(10, 6))
# Pass the labels as `x=`: recent seaborn versions treat the first positional
# argument of countplot as `data`, not as the variable to count.
sns.countplot(x=y_means)
plt.xlabel('clusters')
plt.ylabel('counts in each cluster');

In [None]:
# 2-D scatter of the five Annual Income / Spending Score clusters with centroids.
plt.style.use('ggplot')
x = data3.values
fig = plt.figure(figsize=(16, 10))

# One (colour, marker) pair per cluster id, drawn in cluster-id order.
cluster_styles = [
    ('green', 'o'),
    ('yellow', 'v'),
    ('cyan', 's'),
    ('magenta', 'p'),
    ('orange', 'x'),
]
for cluster_id, (colour, shape) in enumerate(cluster_styles):
    in_cluster = y_means == cluster_id
    plt.scatter(x[in_cluster, 0], x[in_cluster, 1], s=50, c=colour, marker=shape)

centres = km.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1], s=100, c='red', label='centeroid')

plt.xlabel('Annual Income')
plt.ylabel('Spending Score (1-100)')
plt.title('Clusters of Customers')