In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather every file path found under the Kaggle input directory, then echo them.
# NOTE: `file` stays bound to the last path printed; the loading cell reads it.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for file in input_paths:
    print(file)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the dataset from the path left in `file` by the directory listing cell.
ds = pd.read_csv(file)
print(ds.columns)
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() emitted a stray "None" line after the report.
ds.info()
# Per-column count of missing values.
print(ds.isnull().sum())


In [None]:
# Summary statistics of the numeric columns, printed as plain text.
summary_stats = ds.describe()
print(summary_stats)

* 25% of the customers have their age between 18 and 29 years
* 50% of the customers have their age between 18 and 36 years
* 50% of the customers have an annual income less than 62K, maximum being 137K
* 50% of the customers have a spending score of 50 or less (the median is 50)

Gender is the only non-numerical feature - we will encode it using LabelEncoder()

In [None]:
# Inspect the raw category values before encoding them.
print(ds["Gender"].unique())

from sklearn.preprocessing import LabelEncoder

# Encode Gender as integers in a new column.
le = LabelEncoder()
ds["enc_gender"] = le.fit_transform(ds["Gender"])

# Gender -> integer label mapping, pairing each entry of `classes_` with its position.
mapping = dict(zip(le.classes_, range(len(le.classes_))))
print(mapping)
print(ds.head())

# Drop the original Gender column and rename the remaining columns for easier access.
renamed_columns = {
    'Age': 'age',
    'Annual Income (k$)': 'annual_income_k',
    'Spending Score (1-100)': 'spending_score',
    'CustomerID': 'customer_id',
}
ds = ds.drop(columns=["Gender"]).rename(columns=renamed_columns)
print(ds.head())


Obtained the following gender labels encoding mapping: {'Female': 0, 'Male': 1}

Histograms of all features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histograms of every feature, two panels per figure.
# `sns.distplot` is deprecated in seaborn; `sns.histplot` is its replacement.
# Each call receives its target axes explicitly instead of relying on whichever
# subplot happens to be "current".
fig_people, (ax_gender, ax_age) = plt.subplots(1, 2, figsize=(8, 5))
# Encoded gender is an integer code: draw one bar per code and tick only those codes.
sns.histplot(ds["enc_gender"], discrete=True, ax=ax_gender)
ax_gender.set_xticks(sorted(ds["enc_gender"].unique()))
# stat="density" + kde=True keeps the normalised histogram with a density curve
# that distplot drew by default.
sns.histplot(ds["age"], kde=True, stat="density", ax=ax_age)

fig_money, (ax_income, ax_score) = plt.subplots(1, 2, figsize=(8, 5))
sns.histplot(ds["annual_income_k"], kde=True, stat="density", ax=ax_income)
sns.histplot(ds["spending_score"], kde=True, stat="density", ax=ax_score)

* There are more female customers than male customers
* The most significant age segment is between 30 and 40 years, the 2nd is around 20 years and the 3rd is between 40 and 50
* Most significant segments in terms of annual income: 60K and 75 - 80K
* Spending score is the most important indicator in terms of sales - most significant: between 50 and 60, 2nd most significant 50 - 60; it is interesting to notice that 3rd is a score close to 100


Check the correlation between the features

In [None]:
# Correlation heatmap across all columns of `ds`.
f, ax = plt.subplots(figsize=(8, 6))
corr = ds.corr()
# The `np.bool` alias was deprecated in NumPy 1.20 and later removed, so it raises
# AttributeError on current NumPy; the builtin `bool` is the supported spelling.
# The mask is all False, i.e. no cell of the matrix is hidden.
sns.heatmap(
    corr,
    mask=np.zeros_like(corr, dtype=bool),
    cmap=sns.diverging_palette(240, 10, as_cmap=True),
    square=True,
    ax=ax,
)

There is a high correlation between customer_id and annual_income.
There is also a small negative correlation between spending score and age (as one of them increases, the other decreases - so we might say that younger customers have the tendency to spend more). We will explore in detail the relation between each feature.

In [None]:
# Pairwise relationships between every pair of columns in `ds`.
sns.pairplot(data=ds)

* customer_id increases with the annual_income_k
* spending score / annual_income_k, spending score / customer_id form groups of clients
* clients under 40 have a higher spending score
* clients with the highest annual income are between 30 and 50 years old

In [None]:
# Spending score against age, then against annual income, coloured by encoded gender.
sns.relplot(data=ds, x="age", y="spending_score", hue="enc_gender")
sns.relplot(data=ds, x="annual_income_k", y="spending_score", hue="enc_gender")

First plot (age / spending_score)
* higher spending scores (60 - 100) correspond to clients between 18 and 40, with a higher number of females than males.
* spending scores between 40 and 60 are distributed more uniformly over all ages
* a spending score between 0 and 40 is more common to clients aged between 35 and 55

Second plot (annual income / spending score)
* annual income 20 - 40K have a spending_score between 0 to 40  / 70 to 100
* annual income 40 - 70K are grouped at 40 to 60 spending score
* annual income 75K - 105K are grouped at a spending score between 0 to 40 / 70 to 100
* annual incomes over 115K look like outliers

To prevent the correlation between customer_id and annual_income from influencing the clusters, I will drop customer_id.

In [None]:
# Remove the identifier column before clustering.
# Rebinding `ds` and ignoring an already-missing column keeps this cell safe to
# re-run; the in-place drop raised KeyError on a second execution.
ds = ds.drop(columns=["customer_id"], errors="ignore")

Determining the optimal number of clusters using the elbow method
(from previous plots we obtained the indication of 5 clusters)

In [None]:
# Preview the first rows of `ds` as it stands before the clustering cells run.
ds.head()

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt


# For each candidate k, fit KMeans and record:
#   * distortion: mean euclidean distance from each row to its nearest centroid
#   * silhouette score: only recorded for k >= 2
K = range(1,10)
distortions = []
silhouette_sc = []
for k in K:
    model = KMeans(n_clusters=k, random_state=15)
    model.fit(ds)
    nearest_centroid_dist = np.min(cdist(ds, model.cluster_centers_, 'euclidean'), axis=1)
    distortions.append(sum(nearest_centroid_dist) / ds.shape[0])
    if k < 2:
        continue
    silhouette_sc.append(metrics.silhouette_score(ds, model.labels_))

# One line chart per metric; the silhouette curve starts at k=2.
curves = [
    (K, distortions, 'Distortion', 'The Elbow Method showing the optimal k'),
    (range(2,10), silhouette_sc, 'Silhouette Score', 'Silhouette Score'),
]
for k_values, scores, y_label, chart_title in curves:
    plt.plot(k_values, scores, 'bx-')
    plt.xlabel('k')
    plt.ylabel(y_label)
    plt.title(chart_title)
    plt.show()



In [None]:
# Cluster the customers into 6 groups.
# Fit on the feature columns only: this cell writes its labels into
# ds['cluster'], so on any re-run `ds` already carries that column and the old
# labels would otherwise be fed back into KMeans as if they were a feature.
cluster_features = ds.drop(columns=["cluster"], errors="ignore")
model = KMeans(n_clusters=6, init='k-means++', max_iter=300, n_init=10, random_state=15)
y_pred = model.fit_predict(cluster_features)
ds['cluster'] = y_pred
ds.head()

In [None]:
# Customers coloured by cluster label, with the fitted centroids overlaid in red.
# Centroid columns 1 and 2 are used as the x/y coordinates
# (assumes these are annual_income_k and spending_score - confirm against the column order).
centroids = model.cluster_centers_
sns.scatterplot(data=ds, x="annual_income_k", y="spending_score", hue="cluster", legend="full")
sns.scatterplot(x=centroids[:, 1], y=centroids[:, 2], color='r')

In [None]:
from sklearn.decomposition import PCA
# Project the feature columns (cluster labels excluded) onto two principal
# components and plot the projection coloured by cluster label.
pca = PCA(n_components=2)
ds_features = ds.drop(columns=["cluster"])
ds_reduced = pca.fit_transform(ds_features)

ds2 = pd.DataFrame(ds_reduced, columns=["C1", "C2"])
ds2["clusters"] = y_pred
sns.scatterplot(data=ds2, x="C1", y="C2", hue="clusters", legend="full")

In [None]:
# Cluster the customers into 5 groups.
# The 6-cluster cell stored its labels in ds['cluster']; fitting on `ds` as-is
# would treat those labels as an extra feature and bias the new clustering.
# Fit on the feature columns only, then overwrite the label column.
cluster_features = ds.drop(columns=["cluster"], errors="ignore")
model = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=15)
y_pred = model.fit_predict(cluster_features)
ds['cluster'] = y_pred
ds.head()

In [None]:
# Customers coloured by cluster label, with the fitted centroids overlaid in red.
# Centroid columns 1 and 2 are used as the x/y coordinates
# (assumes these are annual_income_k and spending_score - confirm against the column order).
centroids = model.cluster_centers_
sns.scatterplot(data=ds, x="annual_income_k", y="spending_score", hue="cluster", legend="full")
sns.scatterplot(x=centroids[:, 1], y=centroids[:, 2], color='r')

In [None]:
# Display the centroid coordinates of the most recently fitted KMeans model
# (one row per cluster).
model.cluster_centers_

In [None]:
from sklearn.decomposition import PCA
# Reduce the feature columns (cluster labels excluded) to two principal components.
pca = PCA(n_components=2)
ds_features = ds.drop(columns=["cluster"])
ds_reduced = pca.fit_transform(ds_features)

In [None]:
# Preview only the first rows of the projected array; displaying the whole
# array printed one line per customer and buried the rest of the notebook.
ds_reduced[:5]

In [None]:

# Wrap the two-component projection in a DataFrame and attach the cluster labels.
ds2 = pd.DataFrame(ds_reduced, columns=["C1", "C2"])
ds2["clusters"] = y_pred
ds2

In [None]:
# PCA projection coloured by cluster label.
sns.scatterplot(data=ds2, x="C1", y="C2", hue="clusters", legend="full")