In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names_in_folder):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Importing relevant libraries

In [None]:
# %pip (unlike !pip3) always installs into the environment of the running kernel.
%pip install -q kneed

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from kneed import KneeLocator
from yellowbrick.cluster import SilhouetteVisualizer

# 2. Understanding the dataset

In [None]:
# Read the mall-customer CSV once. `customers_df` is the working frame (its Gender
# column is numerically encoded before clustering); `original_df` is an independent
# copy that keeps the raw values so segment labels can be attached to it later.
customers_df = pd.read_csv("/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv")
original_df = customers_df.copy()
customers_df.head()

In [None]:
# CustomerID is not used by any later analysis, so drop it from both frames.
# Reassigning with errors="ignore" (instead of an in-place drop) keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
customers_df = customers_df.drop(columns=["CustomerID"], errors="ignore")
original_df = original_df.drop(columns=["CustomerID"], errors="ignore")
customers_df.head()

In [None]:
# Column dtypes and non-null counts for the working frame.
customers_df.info()

In [None]:
# Summary statistics for the numeric columns of the working frame.
customers_df.describe()

# 3. Exploratory Data Analysis

## 3.1 Univariate Analysis on Numerical variables

In [None]:
# Numeric-only frame used for the univariate box plots: everything except Gender.
customers_numeric_df = customers_df.drop(columns=["Gender"])

In [None]:
# One box per numeric column; tick labels come from the frame itself so they
# cannot drift out of sync with the column order.
fig, ax = plt.subplots(figsize=(9, 9))
ax.set_title("Box plots for 3 numeric variables")
ax.boxplot(customers_numeric_df.values, labels=list(customers_numeric_df.columns))
plt.show()

## 3.2 Univariate Analysis on Categorical variables

### 3.2.1 Binning numerical values

In [None]:
# Count customers per age bracket. The brackets are inclusive on both ends:
# 1-9 first, then one bracket per decade from 10-19 up to 70-79. Ages that fall
# in no bracket are not counted.
age_list = customers_df["Age"].to_list()
age_ranges = [(1, 9)] + [(lower, lower + 9) for lower in range(10, 80, 10)]
age_bins = {f"{lower}-{upper}": 0 for lower, upper in age_ranges}
for age in age_list:
    for lower, upper in age_ranges:
        if lower <= age <= upper:
            age_bins[f"{lower}-{upper}"] += 1
            break  # brackets are disjoint, so at most one can match

In [None]:
# Lollipop chart of the age-bracket counts: a green stem per bracket with a
# maroon marker on top. The figure size is passed to subplots() directly so that
# the axes being drawn on live in the sized figure (no stray empty figure).
fig, ax = plt.subplots(figsize=(18, 18))
x = list(age_bins.keys())
y = list(age_bins.values())
ax.vlines(x, ymin=0, ymax=y, color="g")
ax.plot(x, y, "o", color="maroon")
# Rotate the existing tick labels instead of overwriting them with set_xticklabels.
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("Count of age groups")
ax.set_title("Age groups")
plt.show()

In [None]:
# Pie chart of the male/female split; the Female wedge is pulled out slightly.
pc_labels = ["Male", "Female"]
gender_column = customers_df["Gender"]
male_count = int((gender_column == "Male").sum())
female_count = int((gender_column == "Female").sum())

fig = plt.figure(figsize=(8, 8))
fig.set(facecolor="white")
plt.pie(
    [male_count, female_count],
    labels=pc_labels,
    autopct='%1.1f%%',
    explode=[0, 0.1],
    startangle=15,
    shadow=True,
)
plt.title('Gender distribution')
plt.axis('equal')
plt.show()

## 3.3 Bivariate Analysis

### 3.3.1 Gender and Age

In [None]:
# Horizontal box plot of Age, one box per gender.
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_title("Box plot for age per gender")
sns.boxplot(data=customers_df, x="Age", y="Gender", orient="h", palette='rainbow', ax=ax)

In [None]:
# Density histogram + KDE of male customers' ages. histplot with
# stat="density" and kde_kws={"cut": 3} is seaborn's documented replacement for
# the deprecated distplot.
plt.figure(figsize=(12, 5))
plt.title("Distribution of age of males")
ax = sns.histplot(
    customers_df.loc[customers_df["Gender"] == "Male", "Age"],
    color='b',
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
)

In [None]:
# Density histogram + KDE of female customers' ages. histplot with
# stat="density" and kde_kws={"cut": 3} is seaborn's documented replacement for
# the deprecated distplot.
plt.figure(figsize=(12, 5))
plt.title("Distribution of age of females")
ax = sns.histplot(
    customers_df.loc[customers_df["Gender"] == "Female", "Age"],
    color='r',
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
)

### 3.3.2 Annual Income and Gender

In [None]:
# Side-by-side histograms (with KDE) of annual income: males left, females right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (gender, colour) in zip(axes, (("Male", 'b'), ("Female", 'r'))):
    income = customers_df.loc[customers_df["Gender"] == gender, "Annual Income (k$)"]
    sns.histplot(income, color=colour, ax=ax, kde=True)
    ax.set_title("Distribution of annual income for " + gender.lower() + "s")

In [None]:
# Overlaid density histograms + KDE of annual income for each gender, using
# histplot (the replacement for the deprecated distplot). Each series is
# labelled and a legend is drawn so the two colours can be told apart.
plt.figure(figsize=(12, 8))
for gender, colour in (("Male", 'blue'), ("Female", 'red')):
    sns.histplot(
        customers_df.loc[customers_df["Gender"] == gender, "Annual Income (k$)"],
        color=colour,
        label=gender,
        kde=True,
        stat="density",
        kde_kws={"cut": 3},
    )
plt.legend()

In [None]:
# Violin plot of annual income split by gender.
fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(data=customers_df, x="Gender", y="Annual Income (k$)", ax=ax)

In [None]:
# Horizontal box plot of annual income, one box per gender.
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_title("Box plot for Annual Income per gender")
sns.boxplot(data=customers_df, x="Annual Income (k$)", y="Gender", orient="h", palette="flare", ax=ax)

Interesting outlier in male category...

### 3.3.3 Spending Score and Gender

In [None]:
# Side-by-side histograms (with KDE) of spending score: males left, females right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (gender, colour) in zip(axes, (("Male", 'b'), ("Female", 'r'))):
    spending = customers_df.loc[customers_df["Gender"] == gender, "Spending Score (1-100)"]
    sns.histplot(spending, color=colour, ax=ax, kde=True)
    ax.set_title("Distribution of spending score for " + gender.lower() + "s")

In [None]:
# Overlaid density histograms + KDE of spending score for each gender, using
# histplot (the replacement for the deprecated distplot).
plt.figure(figsize=(12, 8))
for gender, colour in (("Male", 'blue'), ("Female", 'red')):
    sns.histplot(
        customers_df.loc[customers_df["Gender"] == gender, "Spending Score (1-100)"],
        color=colour,
        label=gender,
        kde=True,
        stat="density",
        kde_kws={"cut": 3},
    )
plt.legend()

In [None]:
# Violin plot of spending score split by gender.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Violin plot: Spending score vs Gender")
ax = sns.violinplot(data=customers_df, x="Gender", y="Spending Score (1-100)", ax=ax)

In [None]:
# Horizontal box plot of spending score, one box per gender.
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_title("Box plot for spending score per gender")
sns.boxplot(data=customers_df, x="Spending Score (1-100)", y="Gender", orient="h", palette="viridis", ax=ax)

### 3.3.4 Spending Score and Annual Income

In [None]:
# Scatter of spending score (x) against annual income (y) for every customer.
x = customers_df["Spending Score (1-100)"].to_list()
y = customers_df["Annual Income (k$)"].to_list()
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(x, y)
fig.suptitle("Spending score vs annual income", fontsize=20)
ax.set_xlabel("Spending score (1-100)", fontsize=16)
ax.set_ylabel("Annual Income (k$)", fontsize=16)

## 3.4 Multivariate Analysis

### 3.4.1 Annual Income by Age & Gender

In [None]:
# Joint plot of annual income against age for all customers.
joint_grid = sns.jointplot(data=customers_df, x='Age', y='Annual Income (k$)', color="g")
joint_fig = joint_grid.fig
joint_fig.suptitle("Annual Income by Age (all genders)")
joint_fig.tight_layout()

In [None]:
# Joint plot of annual income against age, restricted to male customers.
male_customers = customers_df[customers_df["Gender"] == "Male"]
joint_grid = sns.jointplot(data=male_customers, x='Age', y='Annual Income (k$)', color="b")
joint_fig = joint_grid.fig
joint_fig.suptitle("Annual Income by Age (males only)")
joint_fig.tight_layout()

In [None]:
# Joint plot of annual income against age, restricted to female customers.
female_customers = customers_df[customers_df["Gender"] == "Female"]
joint_grid = sns.jointplot(data=female_customers, x='Age', y='Annual Income (k$)', color="r")
joint_fig = joint_grid.fig
joint_fig.suptitle("Annual Income by Age (Female only)")
joint_fig.tight_layout()

In [None]:
# Per-gender linear fit of annual income on age. `height` is the current name of
# the facet-size argument (the old `size` spelling is no longer accepted by
# recent seaborn releases).
sns.lmplot(x="Age", y="Annual Income (k$)", hue="Gender", data=customers_df, palette="cubehelix", height=7)

### 3.4.2 Spending Score by Age & Gender

In [None]:
# Joint plot of spending score against age for all customers.
joint_grid = sns.jointplot(data=customers_df, x='Age', y='Spending Score (1-100)', color="g")
joint_fig = joint_grid.fig
joint_fig.suptitle("Spending Score by Age (all genders)")
joint_fig.tight_layout()

In [None]:
# Joint plot of spending score against age, restricted to male customers.
male_customers = customers_df[customers_df["Gender"] == "Male"]
joint_grid = sns.jointplot(data=male_customers, x='Age', y='Spending Score (1-100)', color="b")
joint_fig = joint_grid.fig
joint_fig.suptitle("Spending Score by Age (males only)")
joint_fig.tight_layout()

In [None]:
# Joint plot of spending score against age, restricted to female customers.
female_customers = customers_df[customers_df["Gender"] == "Female"]
joint_grid = sns.jointplot(data=female_customers, x='Age', y='Spending Score (1-100)', color="r")
joint_fig = joint_grid.fig
joint_fig.suptitle("Spending Score by Age (Female only)")
joint_fig.tight_layout()

In [None]:
# Per-gender robust linear fit of spending score on age. `height` is the
# current name of the facet-size argument (formerly `size`).
sns.lmplot(x="Age", y="Spending Score (1-100)", hue="Gender", data=customers_df, palette='inferno_r', height=7, robust=True)

In [None]:
# Per-gender ordinary linear fit of spending score on age, for comparison with
# the robust fit. `height` is the current name of the facet-size argument
# (formerly `size`).
sns.lmplot(x="Age", y="Spending Score (1-100)", hue="Gender", data=customers_df, palette='inferno_r', height=7)

### 3.4.3 All attributes

In [None]:
# 3-D scatter of Age x Annual Income x Spending Score, coloured by gender.
# The encoding is applied with a non-inplace replace so `temp_df` is a new frame
# and `customers_df` is guaranteed to be left untouched by this plotting cell.
gender_encode = {"Gender": {"Male": 0, "Female": 1}}
temp_df = customers_df.replace(gender_encode)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
img = ax.scatter(
    temp_df["Age"].to_list(),
    temp_df["Annual Income (k$)"].to_list(),
    temp_df["Spending Score (1-100)"].to_list(),
    c=temp_df["Gender"].to_list(),
    cmap="RdYlGn",
)
ax.set_xlabel("Age")
ax.set_ylabel("Annual Income (k$)")
ax.set_zlabel("Spending Score (1-100)")
fig.colorbar(img)  # colour scale for the 0 = Male / 1 = Female encoding
plt.show()

# 4. Clustering

## 4.1 Preprocessing

In [None]:
# Encode Gender numerically (Male -> 0, Female -> 1) so every column of the
# working frame can be fed to the clustering algorithms.
gender_encode = {"Gender": {"Male": 0, "Female": 1}}
customers_df = customers_df.replace(to_replace=gender_encode)

In [None]:
# Check the working frame after Gender has been encoded as 0/1.
customers_df.head()

## 4.2 DBSCAN

### 4.2.1 Using K-Nearest Neighbors to determine optimal epsilon

In [None]:
from sklearn.neighbors import NearestNeighbors
# k-distance heuristic for DBSCAN's eps: take each customer's distance to its
# nearest neighbour, sort those distances, and look for the elbow of the curve.
# With n_neighbors=2 and the training data as the query, column 0 is normally
# each point's zero distance to itself, so column 1 is the nearest other point.
# NOTE(review): DBSCAN below uses min_samples=5 while this uses 2 neighbours -
# confirm the mismatch is intended.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(customers_df)
distances, indices = nbrs.kneighbors(customers_df)
distances = np.sort(distances, axis=0)
nearest_distances = distances[:, 1]
eps_kneedle = KneeLocator(
    range(len(nearest_distances)),
    nearest_distances,
    curve='convex',
    direction='increasing'
)
if eps_kneedle.elbow is None:
    # KneeLocator reports None when it cannot detect an elbow; say so instead of
    # failing on int(None).
    print("No elbow found in the sorted nearest-neighbour distances; choose eps manually")
else:
    print(f"Optimal eps is {round(nearest_distances[int(eps_kneedle.elbow)],3)}")
    eps_kneedle.plot_knee()

### 4.2.2 Silhouette score

In [None]:
# Fit DBSCAN and score the resulting labelling with the silhouette coefficient.
# The eps value is presumably the "Optimal eps" printed by the elbow cell in
# section 4.2.1 - re-check it if the data changes.
DBSCAN_EPS = 9.327
db_clustering = DBSCAN(eps=DBSCAN_EPS, min_samples=5, metric="euclidean")
db_clustering.fit(customers_df)
labels = db_clustering.labels_
dbscan_silhouette = silhouette_score(customers_df, labels, metric='euclidean')
print(f"Silhouette Score: {dbscan_silhouette}")

## 4.3 KMeans Clustering

### 4.3.1 Silhouette score

In [None]:
# Sweep KMeans over k = 2..8. For each k, record the silhouette score and the
# sum of squared errors (inertia), and draw a silhouette plot on its own figure.
silhouette_score_list = []
sse_list = []
num_of_clusters = []

for i in range(2, 9):
    fig = plt.figure(figsize=(8, 8))
    fig.suptitle(f"Silhouette plot for {i} clusters")
    ax = fig.add_subplot()
    print("For Number of Clusters = ", i)
    num_of_clusters.append(i)
    # n_init is pinned because its default differs between scikit-learn
    # releases; an explicit value keeps results comparable across versions.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans.fit(customers_df)
    labels = kmeans.labels_
    sse_list.append(kmeans.inertia_)
    # Compute the silhouette once and reuse it for both the list and the log.
    score = silhouette_score(customers_df, labels, metric='euclidean')
    silhouette_score_list.append(score)
    print(f"Silhouette Score: {score}")
    print(f"Sum of Squared Errors: {kmeans.inertia_}")
    print("\n")
    visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick', ax=ax)
    visualizer.fit(customers_df)


### 4.3.2 Sum of Squared Errors (SSE)

In [None]:
# Locate the knee of the (decreasing, convex) SSE-vs-k curve from the sweep above.
kmeans_kneedle = KneeLocator(
    x=num_of_clusters,
    y=sse_list,
    curve='convex',
    direction='decreasing',
)
optimal_k_by_sse = int(kmeans_kneedle.knee)
print(f"Optimal number of clusters by SSE: {optimal_k_by_sse}")
kmeans_kneedle.plot_knee()

## 4.4 Hierarchical Clustering 

### 4.4.1 Silhouette score

In [None]:
# Sweep Ward-linkage agglomerative clustering over k = 2..8, recording the
# silhouette score for each k, then locate the knee of the score curve.
silhouette_score_list = []
num_of_clusters = []
for i in range(2, 9):
    num_of_clusters.append(i)
    print("For Number of Clusters = ", i)
    # Ward linkage only works with Euclidean distances, which is also the
    # default, so the deprecated `affinity` argument is simply omitted.
    model = AgglomerativeClustering(n_clusters=i, linkage='ward')
    model.fit(customers_df)
    labels = model.labels_
    # Compute the silhouette once and reuse it for both the list and the log.
    score = silhouette_score(customers_df, labels, metric='euclidean')
    silhouette_score_list.append(score)
    print(f"Silhouette Score: {score}")
    print("\n")
kneedle = KneeLocator(
    num_of_clusters,
    silhouette_score_list,
    curve='concave',
    direction='increasing'
    )


print("\n")
print(f"Optimal number of clusters by silhouette score: {int(kneedle.elbow)}")
kneedle.plot_knee()

## 4.5 Evaluation of clustering models

Since the silhouette score given by DBSCAN is so low, we will not be considering it.<br>
KMeans and Hierarchical clustering both gave an optimal cluster count of 5. In terms of silhouette score, both are pretty similar, with KMeans outperforming Hierarchical clustering by just a bit.<br>
In addition, hierarchical clustering is more computationally expensive, so let's go forward with KMeans.

# 5. Understanding customer segments

## 5.1 Preprocessing

In [None]:
# Fit the final 5-cluster KMeans model on the encoded frame and attach each
# customer's cluster label to the raw (un-encoded) frame as "segment".
# n_init is pinned because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
kmeans.fit(customers_df)
labels = kmeans.labels_
original_df["segment"] = labels
original_df.head()

In [None]:
# Number segments from 1 instead of 0. The column is rebuilt from the fitted
# model's labels (rather than incrementing itself) so that re-running this cell
# does not shift the segment numbers a second time.
original_df['segment'] = kmeans.labels_ + 1

In [None]:
# Check the raw frame after the segment labels were shifted to start at 1.
original_df.head()

In [None]:
# Summary statistics of the numeric columns, computed separately per segment.
original_df.groupby('segment').describe()

## 5.2 Visualizations

### 5.2.1 Distribution of customers across segments

In [None]:
# Interactive pie chart of how many customers fall in each segment.
segment_pie = px.pie(data_frame=original_df, names='segment')
segment_pie.show()

### 5.2.1 Distribution of numerical attributes

In [None]:
def numberdistributiongraph(num_feature):
    """Show a box plot of one column of ``original_df``, one box per segment.

    Args:
        num_feature: Name of the column to plot on the y axis. It is also used
            as the prefix of the chart title.

    Returns:
        None. The plotly figure is displayed as a side effect.
    """
    box_chart = px.box(data_frame=original_df, x="segment", y=num_feature)
    box_chart.update_layout(title_text=num_feature + " Distribution Chart")
    box_chart.show()
    
# Draw the per-segment box plot for each numeric attribute in turn.
num_features = ["Age", "Annual Income (k$)", "Spending Score (1-100)"]
for feature_name in num_features:
    numberdistributiongraph(feature_name)

### 5.2.2 Distribution of categorical attributes

In [None]:
import plotly.graph_objects as go
# One bar chart per segment showing how many male and female customers it holds.
for i in sorted(original_df["segment"].unique()):
    gender_counts = original_df.loc[original_df["segment"] == i, "Gender"].value_counts()
    # Look counts up by label: value_counts() orders by frequency, so positional
    # indexing would attach the counts to the wrong gender whenever males
    # outnumber females, and would fail for a single-gender segment.
    male_count = gender_counts.get("Male", 0)
    female_count = gender_counts.get("Female", 0)
    fig = go.Figure([go.Bar(x=["Male", "Female"], y=[male_count, female_count])])
    fig.update_layout(
        title_text=f"Distribution of gender in segment {i}",
    )
    fig.show()

# 6. Summarizing segment attributes

Putting all the information above into context, let's pen down some thoughts:
- Genders are fairly evenly distributed across the different segments
- Segments 2 and 5 are generally younger than those in segments 1, 3 and 4
- Annual income of customers in segment 3 sits in the middle of the other segments. Segments 1 and 5 are on the low side, while segments 2 and 4 are high
- Customers in segment 2 and 5 are high in terms of spending score, 3 is average, and 1 and 4 are lower than the other segments

Let's organize this in the form of a table

|Segment|Spending|Annual Income|Age (Median)|
|-|-|-|-|
|1|Low|Low|40s|
|2|High|High|Mid 30s|
|3|Middle|Middle|Mid 40s|
|4|Low|High|Low 40s|
|5|High|Low|Low 20s|

# 7. Business Recommendations

Given the data above, we can make the following recommendations for the business to take based on their segments:
<br><br>
<b>Segment 1)</b> Put fewer resources into marketing to and retaining these customers. Given limited resources, they have low potential to spend due to their low annual income, and a low likelihood of spending.<br>
<b>Segment 2)</b>  Since they are already spending, put effort into retaining customers that fall within this segment, as they provide high revenue. This can possibly be done through loyalty schemes/programs.<br>
<b>Segment 3)</b>  They sit right in the middle in terms of spending score and annual income. Perhaps more effort can be put into marketing products or retaining them if the business has the resources.<br>
<b>Segment 4)</b>  These customers have high annual income and low spending. More effort can be put into marketing and attracting them into the business ecosystem.<br>
<b>Segment 5)</b>  Interestingly, these customers are high in spending but low in income, and are quite young as well. They may be students who do not have a job but spend the money of their parents. Since retaining them may be uncertain due to the strictness of their parents, we could focus on product marketing and recommendation.

Thank you and hope you've enjoyed my notebook, please leave feedback if any! I had a lot of fun doing this :)