In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = [os.path.join(dirname, filename) for filename in filenames]
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Tasks:**
1. Check for missing values and fix any that are found
2. Gender distribution
3. Age distribution
4. Annual income distribution
5. Spending score distributions
6. Check the correlation with spending score
7. Annual income vs spending score, also consideration with Gender
8. Gender vs spending score
9. Age vs spending score considering gender
10. Segmentation of Annual income vs spending score with K-Means - also include choosing the number of clusters using the Elbow method (manual and KneeLocator) and the Silhouette score
11. Segmentation of Annual income vs spending score DBScan - silhouette_score
12. Segmentation of Age vs spending score with K-Means - also include choosing the number of clusters using the Elbow method (manual and KneeLocator) and the Silhouette score
13. Segmentation of Age vs spending score DBScan - silhouette_score

**Import necessary libraries**

In [None]:
!pip install --upgrade kneed

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from kneed import KneeLocator, DataGenerator
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN

In [None]:
# Location of the Mall Customers CSV inside the Kaggle input mount.
path = "/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"

# Read the CSV into a DataFrame and preview the first rows.
data = pd.read_csv(filepath_or_buffer=path)
data.head(5)

In [None]:
# Print a summary of the frame: column names, non-null counts and data types
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

**1. Check missing values**

In [None]:
# Number of missing entries per column (isnull is an alias of isna).
data.isnull().sum()

**Observation:** There are no null values

**2. Gender distribution**

In [None]:
# Bar chart with the number of customers per gender.
ax = sns.countplot(data=data, x='Gender')
ax.set_title("Gender distribution")

**Observation:** There are more female customers than male customers. So it seems that women shop more often, but we can't conclude from this whether they spend more or not.

**3. Age distribution**

In [None]:
# Summary statistics of the Age column.
data.loc[:, 'Age'].describe()

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the equivalent histogram-plus-density view.
sns.histplot(data['Age'], kde=True, stat='density')
plt.title("Age distribution")

In [None]:
# The column must be passed by keyword: in recent seaborn releases the only
# positional argument is `data`, so a positional 'Age' collides with data=data.
sns.boxplot(x='Age', data=data)
plt.title("Customer Age distribution")

**Observation:** The data is right skewed: 75% of our customers are in the range of 18-50 years. The youngest customer is 18 years old and the oldest is 70 years old.

**4. Annual income distribution**

In [None]:
# Summary statistics of the annual income column (values are in k$).
data.loc[:, 'Annual Income (k$)'].describe()

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the equivalent histogram-plus-density view.
sns.histplot(data['Annual Income (k$)'], kde=True, stat='density')
plt.title("Income distribution")

In [None]:
# Box plot of the annual income column.
ax = sns.boxplot(data=data, x='Annual Income (k$)')
ax.set_title("Annual income distribution")

**Observation:** The data seems to be a bit left skewed. The minimum annual income is 15,000 dollars and the maximum is 137,000 dollars. Approximately 75% of the customers have an annual income of less than 78,000 dollars.

**5. Spending score distributions**

In [None]:
# Summary statistics of the spending score column.
data.loc[:, 'Spending Score (1-100)'].describe()

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the equivalent histogram-plus-density view.
sns.histplot(data['Spending Score (1-100)'], kde=True, stat='density')
plt.title("Spending Score distribution")

In [None]:
# The column must be passed by keyword: in recent seaborn releases the only
# positional argument is `data`, so a positional column name collides with data=data.
sns.boxplot(x='Spending Score (1-100)', data=data)
plt.title('Spending Dist')

**Observation:** The score seems to be right skewed

**6. Check the correlation with spending score**

In [None]:
# Drop the CustomerID column before the correlation analysis.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
data = data.drop(columns='CustomerID', errors='ignore')

In [None]:
# Encode Gender as a 0/1 integer flag.
# get_dummies returns a DataFrame (with bool dtype on recent pandas); take its single
# remaining column explicitly and cast to int so `isMale` is a plain numeric Series.
data['isMale'] = pd.get_dummies(data['Gender'], drop_first=True).iloc[:, 0].astype(int)

In [None]:
# `Gender` is a string column and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0, so the correlation is restricted to numeric/boolean columns.
sns.heatmap(data.select_dtypes(include=['number', 'bool']).corr())

In [None]:
# Pairwise scatter plots and per-column distributions for the frame.
sns.pairplot(data=data)

**Observation:** From these plots we cannot say that the spending score is linked to age, income or gender.

**7. Annual Income vs Spending Score with Gender**

In [None]:
# Income against spending score, coloured by gender, on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    data=data,
    x='Spending Score (1-100)',
    y='Annual Income (k$)',
    hue='Gender',
    ax=ax,
)
ax.set_title("Income vs Spending")

**Observation:** There seem to be 5 different categories of customers.

| Spending score | Income (k in dollars) | Majority of Genders |
|----------------|-----------------------|---------------------|
| 1-40           | 15-40                 | Female              |
| 1-40           | 70-138                | Male                |
| 40-60          | 40-65                 | Female              |
| 60-100         | 10-40                 | Female              |
| 60-100         | 70-138                | Female              |
              
Most of the spending scores are between 40 and 60, for customers with an annual income between 40k and 65k dollars. This range has more female customers than male customers. We can target the female customers with an income in the range of 40-65k dollars, or, more generally, all genders in the 40-65k income range.

**8. Gender vs Spending score**

In [None]:
# Spending score per gender on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=data, x='Spending Score (1-100)', y='Gender', ax=ax)
ax.set_title("Gender wise spending score")

**Observation:** Only in the spending score range 1-20 are there more male customers; above a score of 20, female customers are in the majority.

**9. Age vs spending score considering Gender**

In [None]:
# Age against spending score, coloured by gender, on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    data=data,
    x='Spending Score (1-100)',
    y='Age',
    hue="Gender",
    ax=ax,
)
ax.set_title("Age vs Spending score")

**Observation:** There is a clear indication that, independent of gender, only customers in the 18-40 age group have a spending score above 60. So we should target customers in the 18-40 age group.

**10. Income vs Spending Score K-means clustering**

In [None]:
# Standardise the features (zero mean, unit variance) because the clustering
# algorithms used below are distance based.
columns_to_transform = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# StandardScaler standardises every column independently, so a single fit_transform
# replaces the per-column loop. Building a new DataFrame (instead of assigning into a
# slice of `data`) avoids pandas' SettingWithCopyWarning and leaves `data` untouched.
data_to_transform = pd.DataFrame(
    StandardScaler().fit_transform(data[columns_to_transform]),
    columns=columns_to_transform,
    index=data.index,
)

data_to_transform.head()

## Choose appropriate number of clusters

1. Elbow method - simple observation, or with KneeLocator
2. Silhouette coefficient

In [None]:
# Keep only the two standardised features used for the income/spending segmentation.
income_spending_features = ['Annual Income (k$)', 'Spending Score (1-100)']
income_spending_data = data_to_transform[income_spending_features]
income_spending_data

In [None]:
# Elbow method: record the KMeans inertia (within-cluster sum of squared errors)
# for k = 1..9 clusters.
sse = []
for n in range(1, 10):
    # A fixed random_state makes the curve (and the elbow read from it) reproducible
    # across runs; n_init is set explicitly so the result does not depend on the
    # scikit-learn version's default.
    kmean = KMeans(n_clusters=n, n_init=10, random_state=42)
    kmean.fit(income_spending_data)
    sse.append(kmean.inertia_)

In [None]:
# Plot SSE against the number of clusters to look for the elbow.
plt.style.use("fivethirtyeight")
cluster_counts = range(1, 10)
fig, ax = plt.subplots()
ax.plot(cluster_counts, sse)
ax.set_xticks(cluster_counts)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("SSE")
plt.show()

**Observation:** From the elbow method we can choose 5 clusters, as after that the error is almost constant. To pick the elbow more objectively, we can use KneeLocator.

In [None]:
# Let KneeLocator pick the elbow of the decreasing, convex SSE curve.
kl = KneeLocator(
    x=range(1, 10),
    y=sse,
    curve="convex",
    direction='decreasing',
)
kl.elbow

**Knee Locator:** It suggests that the best number of clusters is 4.

As our manual reading of the elbow plot and KneeLocator differ by only 1 cluster, let's go with 5.

2. Silhouette Coefficient

In [None]:
# The full frame was already displayed when it was created; a short preview is
# enough as a reminder of the clustering input.
income_spending_data.head()

In [None]:
# Silhouette coefficient of the KMeans clustering for k = 2..10.
silhouette_coefficients = []

# The silhouette coefficient needs at least 2 clusters, so the range starts at 2.
for k in range(2, 11):
    # Fixed random_state / explicit n_init make the scores reproducible across runs.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(income_spending_data)
    score = silhouette_score(income_spending_data, kmeans.labels_)
    silhouette_coefficients.append(score)

In [None]:
# Plot the silhouette coefficient against the number of clusters.
plt.style.use("fivethirtyeight")
candidate_ks = range(2, 11)
fig, ax = plt.subplots()
ax.plot(candidate_ks, silhouette_coefficients)
ax.set_xticks(candidate_ks)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Coefficient")
plt.show()

**Observation:** The closer the silhouette coefficient is to 1, the better the clustering. Here the coefficient is highest at 5 clusters, so we will take the number of clusters as 5.

In [None]:
# Cluster income vs spending score with KMeans using 5 clusters.
# Fixed random_state / explicit n_init keep the cluster assignment (and therefore
# the colours in the plot) the same on every run.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
identified_clusters = kmeans.fit_predict(income_spending_data)

# Keep the labels next to the standardised features they were fitted on.
data_with_clusters = income_spending_data.copy()
data_with_clusters['Clusters'] = identified_clusters

# Plot on the original (unscaled) axes, coloured by cluster label.
plt.scatter(data['Spending Score (1-100)'], data['Annual Income (k$)'], c=data_with_clusters['Clusters'], cmap='rainbow')
plt.xlabel("Spending Score")
plt.ylabel("Annual Income")
plt.title("Spending Score vs Annual Income")

**11. Income vs Spending Score using DBScan**

In [None]:
# Search for a good epsilon: silhouette coefficient of DBSCAN for each candidate value.
# Note: DBSCAN labels noise points as -1, and they are scored here like any other label.
dbscan_silhouette = []
distances = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
for eps in distances:
    dbscan = DBSCAN(eps=eps)
    dbscan.fit(income_spending_data)

    # silhouette_score raises ValueError unless there are between 2 and
    # n_samples - 1 distinct labels; record NaN for such an epsilon instead of crashing.
    n_labels = len(set(dbscan.labels_))
    if 2 <= n_labels < len(income_spending_data):
        score = silhouette_score(income_spending_data, dbscan.labels_).round(2)
    else:
        score = float('nan')
    dbscan_silhouette.append(score)
dbscan_silhouette

In [None]:
# Plot the silhouette coefficient against epsilon.
# Reuse `distances` from the search cell instead of repeating the literal list, so
# the x-axis can never drift out of sync with the values that were actually scored.
plt.style.use("fivethirtyeight")
plt.plot(distances, dbscan_silhouette)
plt.xticks(distances)
plt.xlabel("Epsilon")
plt.ylabel("Silhouette Coefficient")
plt.show()

**Observation** We can see the eps=0.4 gives a good result

In [None]:
# Cluster income vs spending score with DBSCAN using epsilon = 0.4.
dbscan = DBSCAN(eps=0.4)
identified_clusters = dbscan.fit_predict(income_spending_data)

# New frame holding the standardised features plus their cluster label.
data_with_clusters = income_spending_data.assign(Clusters=identified_clusters)

# Scatter on the original (unscaled) axes, coloured by cluster label.
fig, ax = plt.subplots()
ax.scatter(
    data['Spending Score (1-100)'],
    data['Annual Income (k$)'],
    c=data_with_clusters['Clusters'],
    cmap='rainbow',
)
ax.set_xlabel("Spending Score")
ax.set_ylabel("Annual Income")
ax.set_title("Spending Score vs Annual Income")

**Observation:** We can see that the cluster distribution is not good for DBScan here; the reason is that the dataset is not very densely separated.

**12. Age vs Spending Score K-means clustering**

In [None]:
# Standardise the features (zero mean, unit variance) because the clustering
# algorithms used below are distance based.
columns_to_transform = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# StandardScaler standardises every column independently, so a single fit_transform
# replaces the per-column loop. Building a new DataFrame (instead of assigning into a
# slice of `data`) avoids pandas' SettingWithCopyWarning and leaves `data` untouched.
data_to_transform = pd.DataFrame(
    StandardScaler().fit_transform(data[columns_to_transform]),
    columns=columns_to_transform,
    index=data.index,
)

data_to_transform.head()

## Choose appropriate number of clusters

1. Elbow method - simple observation, or with KneeLocator
2. Silhouette coefficient

In [None]:
# Keep only the two standardised features used for the age/spending segmentation.
age_spending_features = ['Age', 'Spending Score (1-100)']
age_spending_data = data_to_transform[age_spending_features]
age_spending_data

In [None]:
# Elbow method: record the KMeans inertia (within-cluster sum of squared errors)
# for k = 1..9 clusters.
sse = []
for n in range(1, 10):
    # A fixed random_state makes the curve (and the elbow read from it) reproducible
    # across runs; n_init is set explicitly so the result does not depend on the
    # scikit-learn version's default.
    kmean = KMeans(n_clusters=n, n_init=10, random_state=42)
    kmean.fit(age_spending_data)
    sse.append(kmean.inertia_)

In [None]:
# Plot SSE against the number of clusters to look for the elbow.
plt.style.use("fivethirtyeight")
cluster_counts = range(1, 10)
fig, ax = plt.subplots()
ax.plot(cluster_counts, sse)
ax.set_xticks(cluster_counts)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("SSE")
plt.show()

**Observation:** We can see that when the number of clusters is 3, the sum of squared errors has almost converged. So we can take the number of clusters as 3. To cross-check, let's use KneeLocator.

In [None]:
# Let KneeLocator pick the elbow of the decreasing, convex SSE curve.
kl = KneeLocator(
    x=range(1, 10),
    y=sse,
    curve="convex",
    direction='decreasing',
)
kl.elbow

In [None]:
# Cluster age vs spending score with KMeans using 3 clusters.
# Fixed random_state / explicit n_init keep the cluster assignment (and therefore
# the colours in the plot) the same on every run.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
identified_clusters = kmeans.fit_predict(age_spending_data)

# Keep the labels next to the standardised features they were fitted on.
data_with_clusters = age_spending_data.copy()
data_with_clusters['Clusters'] = identified_clusters

# Plot on the original (unscaled) axes, coloured by cluster label.
plt.scatter(data['Age'], data['Spending Score (1-100)'], c=data_with_clusters['Clusters'], cmap='rainbow')
plt.ylabel("Spending Score")
plt.xlabel("Age")
plt.title("Spending Score vs Age")

**13. Age vs Spending Score using DBScan**

In [None]:
# Search for a good epsilon: silhouette coefficient of DBSCAN for each candidate value.
# Note: DBSCAN labels noise points as -1, and they are scored here like any other label.
dbscan_silhouette = []
distances = [0.1, 0.2, 0.3, 0.4, 0.5]
for eps in distances:
    dbscan = DBSCAN(eps=eps)
    dbscan.fit(age_spending_data)

    # silhouette_score raises ValueError unless there are between 2 and
    # n_samples - 1 distinct labels; record NaN for such an epsilon instead of crashing.
    n_labels = len(set(dbscan.labels_))
    if 2 <= n_labels < len(age_spending_data):
        score = silhouette_score(age_spending_data, dbscan.labels_).round(2)
    else:
        score = float('nan')
    dbscan_silhouette.append(score)

# Plot the scores against the same `distances` list that was searched, rather than
# a repeated literal that could drift out of sync.
plt.style.use("fivethirtyeight")
plt.plot(distances, dbscan_silhouette)
plt.xticks(distances)
plt.xlabel("Epsilon")
plt.ylabel("Silhouette Coefficient")
plt.show()


**Observation:** From the plot, epsilon = 0.3 appears to give the best silhouette coefficient, so we use that value.

In [None]:
# Cluster age vs spending score with DBSCAN using epsilon = 0.3.
dbscan = DBSCAN(eps=0.3)
identified_clusters = dbscan.fit_predict(age_spending_data)

# New frame holding the standardised features plus their cluster label.
data_with_clusters = age_spending_data.assign(Clusters=identified_clusters)

# Scatter on the original (unscaled) axes, coloured by cluster label.
fig, ax = plt.subplots()
ax.scatter(
    data['Age'],
    data['Spending Score (1-100)'],
    c=data_with_clusters['Clusters'],
    cmap='rainbow',
)
ax.set_ylabel("Spending Score")
ax.set_xlabel("Age")
ax.set_title("Spending Score vs Age")

**Observation:** DBScan does not separate the clusters well, as the density is low.

**Final Conclusion:**\
We should focus on customers aged 20-40, as they tend to spend more. To be more specific, we should focus on customers in that age group with an annual income between 40k and 65k dollars.